lsmpp commited on
Commit
d57e24e
·
verified ·
1 Parent(s): 642ad6a

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +248 -0
  2. .venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/__init__.cpython-312.pyc +0 -0
  3. .venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_adapters.cpython-312.pyc +0 -0
  4. .venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_collections.cpython-312.pyc +0 -0
  5. .venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_compat.cpython-312.pyc +0 -0
  6. .venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_functools.cpython-312.pyc +0 -0
  7. .venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_itertools.cpython-312.pyc +0 -0
  8. .venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_meta.cpython-312.pyc +0 -0
  9. .venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_text.cpython-312.pyc +0 -0
  10. .venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_typing.cpython-312.pyc +0 -0
  11. .venv/lib/python3.12/site-packages/importlib_metadata/compat/__init__.py +0 -0
  12. .venv/lib/python3.12/site-packages/importlib_metadata/compat/__pycache__/__init__.cpython-312.pyc +0 -0
  13. .venv/lib/python3.12/site-packages/importlib_metadata/compat/__pycache__/py311.cpython-312.pyc +0 -0
  14. .venv/lib/python3.12/site-packages/importlib_metadata/compat/__pycache__/py39.cpython-312.pyc +0 -0
  15. .venv/lib/python3.12/site-packages/importlib_metadata/compat/py311.py +22 -0
  16. .venv/lib/python3.12/site-packages/importlib_metadata/compat/py39.py +42 -0
  17. .venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn.h +68 -0
  18. .venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_adv.h +669 -0
  19. .venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_adv_v9.h +669 -0
  20. .venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_backend.h +60 -0
  21. .venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_backend_v9.h +60 -0
  22. .venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_cnn.h +693 -0
  23. .venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_cnn_v9.h +693 -0
  24. .venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_graph.h +992 -0
  25. .venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_graph_v9.h +992 -0
  26. .venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_ops.h +1316 -0
  27. .venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_ops_v9.h +1316 -0
  28. .venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_v9.h +68 -0
  29. .venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_version.h +70 -0
  30. .venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_version_v9.h +70 -0
  31. .venv/lib/python3.12/site-packages/nvidia_nccl_cu12-2.27.3.dist-info/licenses/License.txt +39 -0
  32. .venv/lib/python3.12/site-packages/sklearn/__check_build/__init__.py +54 -0
  33. .venv/lib/python3.12/site-packages/sklearn/__check_build/_check_build.cpython-312-x86_64-linux-gnu.so +0 -0
  34. .venv/lib/python3.12/site-packages/sklearn/__check_build/_check_build.pyx +2 -0
  35. .venv/lib/python3.12/site-packages/sklearn/__check_build/meson.build +6 -0
  36. .venv/lib/python3.12/site-packages/sklearn/__pycache__/__init__.cpython-312.pyc +0 -0
  37. .venv/lib/python3.12/site-packages/sklearn/__pycache__/_built_with_meson.cpython-312.pyc +0 -0
  38. .venv/lib/python3.12/site-packages/sklearn/__pycache__/_config.cpython-312.pyc +0 -0
  39. .venv/lib/python3.12/site-packages/sklearn/__pycache__/_distributor_init.cpython-312.pyc +0 -0
  40. .venv/lib/python3.12/site-packages/sklearn/__pycache__/base.cpython-312.pyc +0 -0
  41. .venv/lib/python3.12/site-packages/sklearn/__pycache__/exceptions.cpython-312.pyc +0 -0
  42. .venv/lib/python3.12/site-packages/sklearn/_build_utils/__init__.py +0 -0
  43. .venv/lib/python3.12/site-packages/sklearn/_build_utils/tempita.py +62 -0
  44. .venv/lib/python3.12/site-packages/sklearn/_build_utils/version.py +16 -0
  45. .venv/lib/python3.12/site-packages/sklearn/_loss/__init__.py +33 -0
  46. .venv/lib/python3.12/site-packages/sklearn/_loss/_loss.pxd +101 -0
  47. .venv/lib/python3.12/site-packages/sklearn/_loss/_loss.pyx.tp +1505 -0
  48. .venv/lib/python3.12/site-packages/sklearn/_loss/link.py +282 -0
  49. .venv/lib/python3.12/site-packages/sklearn/_loss/loss.py +1181 -0
  50. .venv/lib/python3.12/site-packages/sklearn/_loss/meson.build +23 -0
.gitattributes CHANGED
@@ -809,3 +809,251 @@ illustrious_generated/3e2afaad2b7d.png filter=lfs diff=lfs merge=lfs -text
809
  illustrious_generated/04d6bfa98264.png filter=lfs diff=lfs merge=lfs -text
810
  illustrious_generated/62a8fa0ac7dd.png filter=lfs diff=lfs merge=lfs -text
811
  illustrious_generated/d190d03f64a7.png filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
809
  illustrious_generated/04d6bfa98264.png filter=lfs diff=lfs merge=lfs -text
810
  illustrious_generated/62a8fa0ac7dd.png filter=lfs diff=lfs merge=lfs -text
811
  illustrious_generated/d190d03f64a7.png filter=lfs diff=lfs merge=lfs -text
812
+ illustrious_generated/f6342e8db68a.png filter=lfs diff=lfs merge=lfs -text
813
+ illustrious_generated/f7ca451e1933.png filter=lfs diff=lfs merge=lfs -text
814
+ illustrious_generated/6b3c44df8332.png filter=lfs diff=lfs merge=lfs -text
815
+ illustrious_generated/ed13e74032fb.png filter=lfs diff=lfs merge=lfs -text
816
+ illustrious_generated/faa1e7049117.png filter=lfs diff=lfs merge=lfs -text
817
+ illustrious_generated/c17212cc7fda.png filter=lfs diff=lfs merge=lfs -text
818
+ illustrious_generated/6c268f463a2b.png filter=lfs diff=lfs merge=lfs -text
819
+ illustrious_generated/a364591ba4c1.png filter=lfs diff=lfs merge=lfs -text
820
+ illustrious_generated/2ea3ba7918b4.png filter=lfs diff=lfs merge=lfs -text
821
+ illustrious_generated/2ffb09f5cbc0.png filter=lfs diff=lfs merge=lfs -text
822
+ illustrious_generated/0d55065059c0.png filter=lfs diff=lfs merge=lfs -text
823
+ illustrious_generated/85e9723ae8cf.png filter=lfs diff=lfs merge=lfs -text
824
+ illustrious_generated/e89ab638d462.png filter=lfs diff=lfs merge=lfs -text
825
+ illustrious_generated/224c2084abb8.png filter=lfs diff=lfs merge=lfs -text
826
+ illustrious_generated/0b77d88bc5f0.png filter=lfs diff=lfs merge=lfs -text
827
+ illustrious_generated/91076903bce5.png filter=lfs diff=lfs merge=lfs -text
828
+ illustrious_generated/7acda55248bc.png filter=lfs diff=lfs merge=lfs -text
829
+ illustrious_generated/ee32c9618a12.png filter=lfs diff=lfs merge=lfs -text
830
+ illustrious_generated/698a4bf05f13.png filter=lfs diff=lfs merge=lfs -text
831
+ illustrious_generated/bf97f1eaffeb.png filter=lfs diff=lfs merge=lfs -text
832
+ illustrious_generated/62daa562132c.png filter=lfs diff=lfs merge=lfs -text
833
+ illustrious_generated/9ee7e057c8a2.png filter=lfs diff=lfs merge=lfs -text
834
+ illustrious_generated/427d956c743b.png filter=lfs diff=lfs merge=lfs -text
835
+ illustrious_generated/06da7f820423.png filter=lfs diff=lfs merge=lfs -text
836
+ illustrious_generated/92bcab0aaba1.png filter=lfs diff=lfs merge=lfs -text
837
+ illustrious_generated/502a84449b45.png filter=lfs diff=lfs merge=lfs -text
838
+ illustrious_generated/d99abaed93ba.png filter=lfs diff=lfs merge=lfs -text
839
+ illustrious_generated/3a12bf82c05e.png filter=lfs diff=lfs merge=lfs -text
840
+ illustrious_generated/433a115b55a3.png filter=lfs diff=lfs merge=lfs -text
841
+ illustrious_generated/574012fe8664.png filter=lfs diff=lfs merge=lfs -text
842
+ illustrious_generated/7d22dc2a6fb2.png filter=lfs diff=lfs merge=lfs -text
843
+ illustrious_generated/4f23c350b644.png filter=lfs diff=lfs merge=lfs -text
844
+ illustrious_generated/e24085ea542f.png filter=lfs diff=lfs merge=lfs -text
845
+ illustrious_generated/3cc7f3366f7a.png filter=lfs diff=lfs merge=lfs -text
846
+ illustrious_generated/5242430c6777.png filter=lfs diff=lfs merge=lfs -text
847
+ illustrious_generated/6fe5f96649a3.png filter=lfs diff=lfs merge=lfs -text
848
+ illustrious_generated/12875eda15eb.png filter=lfs diff=lfs merge=lfs -text
849
+ illustrious_generated/eac29190186c.png filter=lfs diff=lfs merge=lfs -text
850
+ illustrious_generated/c1276a9fc21b.png filter=lfs diff=lfs merge=lfs -text
851
+ illustrious_generated/a891e5d92031.png filter=lfs diff=lfs merge=lfs -text
852
+ illustrious_generated/0367ba694b76.png filter=lfs diff=lfs merge=lfs -text
853
+ illustrious_generated/f84f116882be.png filter=lfs diff=lfs merge=lfs -text
854
+ illustrious_generated/b8e81c1a4bd1.png filter=lfs diff=lfs merge=lfs -text
855
+ illustrious_generated/392a7a129a01.png filter=lfs diff=lfs merge=lfs -text
856
+ illustrious_generated/1506e01a5598.png filter=lfs diff=lfs merge=lfs -text
857
+ illustrious_generated/cbd5827b38ea.png filter=lfs diff=lfs merge=lfs -text
858
+ illustrious_generated/b80b59fe722f.png filter=lfs diff=lfs merge=lfs -text
859
+ illustrious_generated/a2ca03055273.png filter=lfs diff=lfs merge=lfs -text
860
+ illustrious_generated/b58cf17494db.png filter=lfs diff=lfs merge=lfs -text
861
+ illustrious_generated/4c587778617b.png filter=lfs diff=lfs merge=lfs -text
862
+ illustrious_generated/7c5200560049.png filter=lfs diff=lfs merge=lfs -text
863
+ illustrious_generated/b78d0c1f0687.png filter=lfs diff=lfs merge=lfs -text
864
+ illustrious_generated/5c6f22f08540.png filter=lfs diff=lfs merge=lfs -text
865
+ illustrious_generated/9b2b12c21a2b.png filter=lfs diff=lfs merge=lfs -text
866
+ illustrious_generated/ec96a311c2cb.png filter=lfs diff=lfs merge=lfs -text
867
+ illustrious_generated/a28e4715fc8c.png filter=lfs diff=lfs merge=lfs -text
868
+ illustrious_generated/00f5e16a2236.png filter=lfs diff=lfs merge=lfs -text
869
+ illustrious_generated/0ef8c1ed2c6c.png filter=lfs diff=lfs merge=lfs -text
870
+ illustrious_generated/f214facc5681.png filter=lfs diff=lfs merge=lfs -text
871
+ illustrious_generated/f41b4fc2c7d5.png filter=lfs diff=lfs merge=lfs -text
872
+ illustrious_generated/9e9a0ce3d676.png filter=lfs diff=lfs merge=lfs -text
873
+ illustrious_generated/26d2ef2d7d03.png filter=lfs diff=lfs merge=lfs -text
874
+ illustrious_generated/1e774fcc188d.png filter=lfs diff=lfs merge=lfs -text
875
+ illustrious_generated/7eab3f4f0c8e.png filter=lfs diff=lfs merge=lfs -text
876
+ illustrious_generated/f8631de95d70.png filter=lfs diff=lfs merge=lfs -text
877
+ illustrious_generated/8d95e57fcb27.png filter=lfs diff=lfs merge=lfs -text
878
+ illustrious_generated/7ac791baad53.png filter=lfs diff=lfs merge=lfs -text
879
+ illustrious_generated/7b8529c066a0.png filter=lfs diff=lfs merge=lfs -text
880
+ illustrious_generated/7d8509931e4e.png filter=lfs diff=lfs merge=lfs -text
881
+ illustrious_generated/9fafd1175b72.png filter=lfs diff=lfs merge=lfs -text
882
+ illustrious_generated/7023242de1c0.png filter=lfs diff=lfs merge=lfs -text
883
+ illustrious_generated/99d5b088ccd4.png filter=lfs diff=lfs merge=lfs -text
884
+ illustrious_generated/2bac6ab4413e.png filter=lfs diff=lfs merge=lfs -text
885
+ illustrious_generated/00ff6449b55d.png filter=lfs diff=lfs merge=lfs -text
886
+ illustrious_generated/7b900f6e27b1.png filter=lfs diff=lfs merge=lfs -text
887
+ illustrious_generated/69e10254baf5.png filter=lfs diff=lfs merge=lfs -text
888
+ illustrious_generated/93d9e9abc98e.png filter=lfs diff=lfs merge=lfs -text
889
+ illustrious_generated/095dc81d1160.png filter=lfs diff=lfs merge=lfs -text
890
+ illustrious_generated/3315198d28df.png filter=lfs diff=lfs merge=lfs -text
891
+ illustrious_generated/2549abad7eff.png filter=lfs diff=lfs merge=lfs -text
892
+ illustrious_generated/8a90db3476ef.png filter=lfs diff=lfs merge=lfs -text
893
+ illustrious_generated/72473c769552.png filter=lfs diff=lfs merge=lfs -text
894
+ illustrious_generated/bbf3fb096202.png filter=lfs diff=lfs merge=lfs -text
895
+ illustrious_generated/c5e0eb8a2241.png filter=lfs diff=lfs merge=lfs -text
896
+ illustrious_generated/8fa96985fc06.png filter=lfs diff=lfs merge=lfs -text
897
+ illustrious_generated/645e3b996530.png filter=lfs diff=lfs merge=lfs -text
898
+ illustrious_generated/b9fdc64b985c.png filter=lfs diff=lfs merge=lfs -text
899
+ illustrious_generated/fa67e15ca2bf.png filter=lfs diff=lfs merge=lfs -text
900
+ illustrious_generated/9f5c49f2e362.png filter=lfs diff=lfs merge=lfs -text
901
+ illustrious_generated/e8318516b273.png filter=lfs diff=lfs merge=lfs -text
902
+ illustrious_generated/e801a5ce2da6.png filter=lfs diff=lfs merge=lfs -text
903
+ illustrious_generated/cd9145683d1e.png filter=lfs diff=lfs merge=lfs -text
904
+ illustrious_generated/275253c8ad6b.png filter=lfs diff=lfs merge=lfs -text
905
+ illustrious_generated/f2a6e0c5c432.png filter=lfs diff=lfs merge=lfs -text
906
+ illustrious_generated/586dbda7c6ff.png filter=lfs diff=lfs merge=lfs -text
907
+ illustrious_generated/dff506d177c0.png filter=lfs diff=lfs merge=lfs -text
908
+ illustrious_generated/c8846919f3a8.png filter=lfs diff=lfs merge=lfs -text
909
+ illustrious_generated/afbdb8dce1e5.png filter=lfs diff=lfs merge=lfs -text
910
+ illustrious_generated/fd4c46f2141f.png filter=lfs diff=lfs merge=lfs -text
911
+ illustrious_generated/ee36cea22c91.png filter=lfs diff=lfs merge=lfs -text
912
+ illustrious_generated/6ca60a86b836.png filter=lfs diff=lfs merge=lfs -text
913
+ illustrious_generated/11c7f55b2aab.png filter=lfs diff=lfs merge=lfs -text
914
+ illustrious_generated/d684bc0d0627.png filter=lfs diff=lfs merge=lfs -text
915
+ illustrious_generated/4f1602c01d5b.png filter=lfs diff=lfs merge=lfs -text
916
+ illustrious_generated/45c709323899.png filter=lfs diff=lfs merge=lfs -text
917
+ illustrious_generated/d7bc7c5ba632.png filter=lfs diff=lfs merge=lfs -text
918
+ illustrious_generated/0e0acc59ef85.png filter=lfs diff=lfs merge=lfs -text
919
+ illustrious_generated/1c7a7ed6f359.png filter=lfs diff=lfs merge=lfs -text
920
+ illustrious_generated/31cbd66704bb.png filter=lfs diff=lfs merge=lfs -text
921
+ illustrious_generated/dd8a48931525.png filter=lfs diff=lfs merge=lfs -text
922
+ illustrious_generated/7368d4c82b5f.png filter=lfs diff=lfs merge=lfs -text
923
+ illustrious_generated/c7e1a60c0f5d.png filter=lfs diff=lfs merge=lfs -text
924
+ illustrious_generated/be56d67f1e08.png filter=lfs diff=lfs merge=lfs -text
925
+ illustrious_generated/269ee6e9a79c.png filter=lfs diff=lfs merge=lfs -text
926
+ illustrious_generated/2bb0e99b92bc.png filter=lfs diff=lfs merge=lfs -text
927
+ illustrious_generated/afd28993674d.png filter=lfs diff=lfs merge=lfs -text
928
+ illustrious_generated/585afc2017e2.png filter=lfs diff=lfs merge=lfs -text
929
+ illustrious_generated/f9c5bdc8bef5.png filter=lfs diff=lfs merge=lfs -text
930
+ illustrious_generated/8f338d47820a.png filter=lfs diff=lfs merge=lfs -text
931
+ illustrious_generated/e0443895d658.png filter=lfs diff=lfs merge=lfs -text
932
+ illustrious_generated/67ea9c16fed3.png filter=lfs diff=lfs merge=lfs -text
933
+ illustrious_generated/78dfdb4f0521.png filter=lfs diff=lfs merge=lfs -text
934
+ illustrious_generated/fff7c0390e8a.png filter=lfs diff=lfs merge=lfs -text
935
+ illustrious_generated/c63799030196.png filter=lfs diff=lfs merge=lfs -text
936
+ illustrious_generated/fc061ac787c7.png filter=lfs diff=lfs merge=lfs -text
937
+ illustrious_generated/26185801988b.png filter=lfs diff=lfs merge=lfs -text
938
+ illustrious_generated/656abae8d0b6.png filter=lfs diff=lfs merge=lfs -text
939
+ illustrious_generated/5c4a2ea8f842.png filter=lfs diff=lfs merge=lfs -text
940
+ illustrious_generated/2286bf835a6b.png filter=lfs diff=lfs merge=lfs -text
941
+ illustrious_generated/dc7501a6f47f.png filter=lfs diff=lfs merge=lfs -text
942
+ illustrious_generated/38b5363061d5.png filter=lfs diff=lfs merge=lfs -text
943
+ illustrious_generated/451e48977b1a.png filter=lfs diff=lfs merge=lfs -text
944
+ illustrious_generated/f7621703575c.png filter=lfs diff=lfs merge=lfs -text
945
+ illustrious_generated/891dc839571c.png filter=lfs diff=lfs merge=lfs -text
946
+ illustrious_generated/d1e30fd687b5.png filter=lfs diff=lfs merge=lfs -text
947
+ illustrious_generated/d1413371999b.png filter=lfs diff=lfs merge=lfs -text
948
+ illustrious_generated/0ad3307ea09c.png filter=lfs diff=lfs merge=lfs -text
949
+ illustrious_generated/6fba429dafc5.png filter=lfs diff=lfs merge=lfs -text
950
+ illustrious_generated/481f3834876a.png filter=lfs diff=lfs merge=lfs -text
951
+ illustrious_generated/1e54c0c78134.png filter=lfs diff=lfs merge=lfs -text
952
+ illustrious_generated/a564e408f362.png filter=lfs diff=lfs merge=lfs -text
953
+ illustrious_generated/ec6650b62802.png filter=lfs diff=lfs merge=lfs -text
954
+ illustrious_generated/9f447e4cf3d7.png filter=lfs diff=lfs merge=lfs -text
955
+ illustrious_generated/790ece21df10.png filter=lfs diff=lfs merge=lfs -text
956
+ illustrious_generated/75e576f27cb6.png filter=lfs diff=lfs merge=lfs -text
957
+ illustrious_generated/205b715d279f.png filter=lfs diff=lfs merge=lfs -text
958
+ illustrious_generated/060e926dcc0a.png filter=lfs diff=lfs merge=lfs -text
959
+ illustrious_generated/733c86338921.png filter=lfs diff=lfs merge=lfs -text
960
+ illustrious_generated/b9f37572031b.png filter=lfs diff=lfs merge=lfs -text
961
+ illustrious_generated/43eeb1fb403b.png filter=lfs diff=lfs merge=lfs -text
962
+ illustrious_generated/d22ef7243fac.png filter=lfs diff=lfs merge=lfs -text
963
+ illustrious_generated/162e3face5a7.png filter=lfs diff=lfs merge=lfs -text
964
+ illustrious_generated/765bf9d23c7e.png filter=lfs diff=lfs merge=lfs -text
965
+ illustrious_generated/47418c15a58f.png filter=lfs diff=lfs merge=lfs -text
966
+ illustrious_generated/3030bee9df5a.png filter=lfs diff=lfs merge=lfs -text
967
+ illustrious_generated/e4acb93d313c.png filter=lfs diff=lfs merge=lfs -text
968
+ illustrious_generated/08e454ab01c2.png filter=lfs diff=lfs merge=lfs -text
969
+ illustrious_generated/3f43e650c7d7.png filter=lfs diff=lfs merge=lfs -text
970
+ illustrious_generated/085929212457.png filter=lfs diff=lfs merge=lfs -text
971
+ illustrious_generated/91d346543b7c.png filter=lfs diff=lfs merge=lfs -text
972
+ illustrious_generated/891abd7c9fa3.png filter=lfs diff=lfs merge=lfs -text
973
+ illustrious_generated/1927adcb399a.png filter=lfs diff=lfs merge=lfs -text
974
+ illustrious_generated/7e49e6b5a30b.png filter=lfs diff=lfs merge=lfs -text
975
+ illustrious_generated/2cd36314054f.png filter=lfs diff=lfs merge=lfs -text
976
+ illustrious_generated/b569d3590c66.png filter=lfs diff=lfs merge=lfs -text
977
+ illustrious_generated/9e8dc59217e8.png filter=lfs diff=lfs merge=lfs -text
978
+ illustrious_generated/c2c3bea0e9d5.png filter=lfs diff=lfs merge=lfs -text
979
+ illustrious_generated/05972b153525.png filter=lfs diff=lfs merge=lfs -text
980
+ illustrious_generated/c9bf921e364a.png filter=lfs diff=lfs merge=lfs -text
981
+ illustrious_generated/13cdedc9c525.png filter=lfs diff=lfs merge=lfs -text
982
+ illustrious_generated/d8641bfcdd46.png filter=lfs diff=lfs merge=lfs -text
983
+ illustrious_generated/34afbd2725c8.png filter=lfs diff=lfs merge=lfs -text
984
+ illustrious_generated/f0d97f98333f.png filter=lfs diff=lfs merge=lfs -text
985
+ illustrious_generated/76b2de1037cb.png filter=lfs diff=lfs merge=lfs -text
986
+ illustrious_generated/a370eb471cd7.png filter=lfs diff=lfs merge=lfs -text
987
+ illustrious_generated/f5ab32c63fb8.png filter=lfs diff=lfs merge=lfs -text
988
+ illustrious_generated/5718f8172842.png filter=lfs diff=lfs merge=lfs -text
989
+ illustrious_generated/b7f508ecce88.png filter=lfs diff=lfs merge=lfs -text
990
+ illustrious_generated/5f147d77f3ed.png filter=lfs diff=lfs merge=lfs -text
991
+ illustrious_generated/ac9d950baac7.png filter=lfs diff=lfs merge=lfs -text
992
+ illustrious_generated/8b674edb3a4e.png filter=lfs diff=lfs merge=lfs -text
993
+ illustrious_generated/8ad0a744de62.png filter=lfs diff=lfs merge=lfs -text
994
+ illustrious_generated/5b8f74bcc260.png filter=lfs diff=lfs merge=lfs -text
995
+ illustrious_generated/78026f131004.png filter=lfs diff=lfs merge=lfs -text
996
+ illustrious_generated/d305fe437c6f.png filter=lfs diff=lfs merge=lfs -text
997
+ illustrious_generated/7cce990ade4c.png filter=lfs diff=lfs merge=lfs -text
998
+ illustrious_generated/c76729f0f827.png filter=lfs diff=lfs merge=lfs -text
999
+ illustrious_generated/0706f94ebdc3.png filter=lfs diff=lfs merge=lfs -text
1000
+ illustrious_generated/22af9def0424.png filter=lfs diff=lfs merge=lfs -text
1001
+ illustrious_generated/43877698ad33.png filter=lfs diff=lfs merge=lfs -text
1002
+ illustrious_generated/5a0201bebc6d.png filter=lfs diff=lfs merge=lfs -text
1003
+ illustrious_generated/7ad096e9b528.png filter=lfs diff=lfs merge=lfs -text
1004
+ illustrious_generated/46edb49b5dbf.png filter=lfs diff=lfs merge=lfs -text
1005
+ illustrious_generated/bd65b176bfe6.png filter=lfs diff=lfs merge=lfs -text
1006
+ illustrious_generated/073f299a3b06.png filter=lfs diff=lfs merge=lfs -text
1007
+ illustrious_generated/fc885c9be9af.png filter=lfs diff=lfs merge=lfs -text
1008
+ illustrious_generated/bcfc32b88c98.png filter=lfs diff=lfs merge=lfs -text
1009
+ illustrious_generated/e55e6cf94025.png filter=lfs diff=lfs merge=lfs -text
1010
+ illustrious_generated/b4a9600f3647.png filter=lfs diff=lfs merge=lfs -text
1011
+ illustrious_generated/d7ef34bf47ee.png filter=lfs diff=lfs merge=lfs -text
1012
+ illustrious_generated/8cbc6e1dbe62.png filter=lfs diff=lfs merge=lfs -text
1013
+ illustrious_generated/8633a3dff7ea.png filter=lfs diff=lfs merge=lfs -text
1014
+ illustrious_generated/cb335826ba02.png filter=lfs diff=lfs merge=lfs -text
1015
+ illustrious_generated/3048ba382498.png filter=lfs diff=lfs merge=lfs -text
1016
+ illustrious_generated/eca43ddadd85.png filter=lfs diff=lfs merge=lfs -text
1017
+ illustrious_generated/365e7d0f97c2.png filter=lfs diff=lfs merge=lfs -text
1018
+ illustrious_generated/e71b25950c5d.png filter=lfs diff=lfs merge=lfs -text
1019
+ illustrious_generated/59a595c825c8.png filter=lfs diff=lfs merge=lfs -text
1020
+ illustrious_generated/82ee8177ef04.png filter=lfs diff=lfs merge=lfs -text
1021
+ illustrious_generated/36915299353b.png filter=lfs diff=lfs merge=lfs -text
1022
+ illustrious_generated/ca07713b354c.png filter=lfs diff=lfs merge=lfs -text
1023
+ illustrious_generated/fbebd175667e.png filter=lfs diff=lfs merge=lfs -text
1024
+ illustrious_generated/dacfbbcd3fb3.png filter=lfs diff=lfs merge=lfs -text
1025
+ illustrious_generated/8a371dac467c.png filter=lfs diff=lfs merge=lfs -text
1026
+ illustrious_generated/40c498965cbd.png filter=lfs diff=lfs merge=lfs -text
1027
+ illustrious_generated/190beb9306ef.png filter=lfs diff=lfs merge=lfs -text
1028
+ illustrious_generated/bb2041beb345.png filter=lfs diff=lfs merge=lfs -text
1029
+ illustrious_generated/6f1c05af41ca.png filter=lfs diff=lfs merge=lfs -text
1030
+ illustrious_generated/9f741bd68919.png filter=lfs diff=lfs merge=lfs -text
1031
+ illustrious_generated/9bb815cccb98.png filter=lfs diff=lfs merge=lfs -text
1032
+ illustrious_generated/41d42d8f4842.png filter=lfs diff=lfs merge=lfs -text
1033
+ illustrious_generated/13166cbea867.png filter=lfs diff=lfs merge=lfs -text
1034
+ illustrious_generated/e2812aff73e9.png filter=lfs diff=lfs merge=lfs -text
1035
+ illustrious_generated/954594f7f0a6.png filter=lfs diff=lfs merge=lfs -text
1036
+ illustrious_generated/c4b5bff2dbc1.png filter=lfs diff=lfs merge=lfs -text
1037
+ illustrious_generated/980b174e831c.png filter=lfs diff=lfs merge=lfs -text
1038
+ illustrious_generated/ed89a47fd589.png filter=lfs diff=lfs merge=lfs -text
1039
+ illustrious_generated/a8e5c9011eef.png filter=lfs diff=lfs merge=lfs -text
1040
+ illustrious_generated/f1de13ffcad6.png filter=lfs diff=lfs merge=lfs -text
1041
+ illustrious_generated/574fba2c6515.png filter=lfs diff=lfs merge=lfs -text
1042
+ illustrious_generated/591e156ad5fd.png filter=lfs diff=lfs merge=lfs -text
1043
+ illustrious_generated/aef907db00ce.png filter=lfs diff=lfs merge=lfs -text
1044
+ illustrious_generated/3967f8d787ab.png filter=lfs diff=lfs merge=lfs -text
1045
+ illustrious_generated/a1ec0d3b0b0e.png filter=lfs diff=lfs merge=lfs -text
1046
+ illustrious_generated/9da135f5f21e.png filter=lfs diff=lfs merge=lfs -text
1047
+ illustrious_generated/8fd9fbffb954.png filter=lfs diff=lfs merge=lfs -text
1048
+ illustrious_generated/24e5b9fe7d38.png filter=lfs diff=lfs merge=lfs -text
1049
+ illustrious_generated/495f1b55919f.png filter=lfs diff=lfs merge=lfs -text
1050
+ illustrious_generated/19ff2ce2a961.png filter=lfs diff=lfs merge=lfs -text
1051
+ illustrious_generated/e39fecdd2676.png filter=lfs diff=lfs merge=lfs -text
1052
+ illustrious_generated/7663094bacec.png filter=lfs diff=lfs merge=lfs -text
1053
+ illustrious_generated/6d5feb7de870.png filter=lfs diff=lfs merge=lfs -text
1054
+ illustrious_generated/abe90752beb0.png filter=lfs diff=lfs merge=lfs -text
1055
+ illustrious_generated/cae43d7fd0f8.png filter=lfs diff=lfs merge=lfs -text
1056
+ illustrious_generated/3f5c59c8ee7b.png filter=lfs diff=lfs merge=lfs -text
1057
+ illustrious_generated/49712a2e71f1.png filter=lfs diff=lfs merge=lfs -text
1058
+ illustrious_generated/6346f39915f3.png filter=lfs diff=lfs merge=lfs -text
1059
+ illustrious_generated/4c6ea9681419.png filter=lfs diff=lfs merge=lfs -text
.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (56.9 kB). View file
 
.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_adapters.cpython-312.pyc ADDED
Binary file (5.93 kB). View file
 
.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_collections.cpython-312.pyc ADDED
Binary file (1.98 kB). View file
 
.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_compat.cpython-312.pyc ADDED
Binary file (2.26 kB). View file
 
.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_functools.cpython-312.pyc ADDED
Binary file (3.49 kB). View file
 
.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_itertools.cpython-312.pyc ADDED
Binary file (6.49 kB). View file
 
.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_meta.cpython-312.pyc ADDED
Binary file (3.58 kB). View file
 
.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_text.cpython-312.pyc ADDED
Binary file (3.89 kB). View file
 
.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_typing.cpython-312.pyc ADDED
Binary file (399 Bytes). View file
 
.venv/lib/python3.12/site-packages/importlib_metadata/compat/__init__.py ADDED
File without changes
.venv/lib/python3.12/site-packages/importlib_metadata/compat/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (202 Bytes). View file
 
.venv/lib/python3.12/site-packages/importlib_metadata/compat/__pycache__/py311.cpython-312.pyc ADDED
Binary file (1.27 kB). View file
 
.venv/lib/python3.12/site-packages/importlib_metadata/compat/__pycache__/py39.cpython-312.pyc ADDED
Binary file (1.71 kB). View file
 
.venv/lib/python3.12/site-packages/importlib_metadata/compat/py311.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pathlib
3
+ import sys
4
+ import types
5
+
6
+
7
+ def wrap(path): # pragma: no cover
8
+ """
9
+ Workaround for https://github.com/python/cpython/issues/84538
10
+ to add backward compatibility for walk_up=True.
11
+ An example affected package is dask-labextension, which uses
12
+ jupyter-packaging to install JupyterLab javascript files outside
13
+ of site-packages.
14
+ """
15
+
16
+ def relative_to(root, *, walk_up=False):
17
+ return pathlib.Path(os.path.relpath(path, root))
18
+
19
+ return types.SimpleNamespace(relative_to=relative_to)
20
+
21
+
22
+ relative_fix = wrap if sys.version_info < (3, 12) else lambda x: x
.venv/lib/python3.12/site-packages/importlib_metadata/compat/py39.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Compatibility layer with Python 3.8/3.9
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from typing import TYPE_CHECKING, Any
8
+
9
+ if TYPE_CHECKING: # pragma: no cover
10
+ # Prevent circular imports on runtime.
11
+ from .. import Distribution, EntryPoint
12
+ else:
13
+ Distribution = EntryPoint = Any
14
+
15
+ from .._typing import md_none
16
+
17
+
18
+ def normalized_name(dist: Distribution) -> str | None:
19
+ """
20
+ Honor name normalization for distributions that don't provide ``_normalized_name``.
21
+ """
22
+ try:
23
+ return dist._normalized_name
24
+ except AttributeError:
25
+ from .. import Prepared # -> delay to prevent circular imports.
26
+
27
+ return Prepared.normalize(
28
+ getattr(dist, "name", None) or md_none(dist.metadata)['Name']
29
+ )
30
+
31
+
32
+ def ep_matches(ep: EntryPoint, **params) -> bool:
33
+ """
34
+ Workaround for ``EntryPoint`` objects without the ``matches`` method.
35
+ """
36
+ try:
37
+ return ep.matches(**params)
38
+ except AttributeError:
39
+ from .. import EntryPoint # -> delay to prevent circular imports.
40
+
41
+ # Reconstruct the EntryPoint object to make sure it is compatible.
42
+ return EntryPoint(ep.name, ep.value, ep.group).matches(**params)
.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn.h ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2014-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /* cudnn : Neural Networks Library */
51
+
52
+ #if !defined(CUDNN_H_)
53
+ #define CUDNN_H_
54
+ #if defined(__cplusplus)
55
+ extern "C" {
56
+ #endif
57
+
58
+ #include <cuda_runtime_api.h>
59
+ #include "cudnn_version.h"
60
+ #include "cudnn_graph.h"
61
+ #include "cudnn_ops.h"
62
+ #include "cudnn_adv.h"
63
+ #include "cudnn_cnn.h"
64
+
65
+ #if defined(__cplusplus)
66
+ }
67
+ #endif
68
+ #endif /* CUDNN_H_ */
.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_adv.h ADDED
@@ -0,0 +1,669 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2014-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /* cudnn_adv : cuDNN's advanced and experimental features.
51
+
52
+ */
53
+
54
+ #if !defined(CUDNN_ADV_H_)
55
+ #define CUDNN_ADV_H_
56
+
57
+ #include <stdint.h>
58
+
59
+ #include "cudnn_version.h"
60
+ #include "cudnn_ops.h"
61
+
62
+ /* These version numbers are autogenerated, do not edit manually. */
63
+ #define CUDNN_ADV_MAJOR 9
64
+ #define CUDNN_ADV_MINOR 10
65
+ #define CUDNN_ADV_PATCH 2
66
+
67
+ #if (CUDNN_ADV_MAJOR != CUDNN_MAJOR) || (CUDNN_ADV_MINOR != CUDNN_MINOR) || (CUDNN_ADV_PATCH != CUDNN_PATCHLEVEL)
68
+ #error Version mismatch in cuDNN ADV INFER!!!
69
+ #endif
70
+
71
+ #if defined(__cplusplus)
72
+ extern "C" {
73
+ #endif
74
+
75
+ /* BASIC RNN API */
76
+
77
+ typedef enum {
78
+ CUDNN_RNN_ALGO_STANDARD = 0,
79
+ CUDNN_RNN_ALGO_PERSIST_STATIC = 1,
80
+ CUDNN_RNN_ALGO_PERSIST_DYNAMIC = 2,
81
+ CUDNN_RNN_ALGO_PERSIST_STATIC_SMALL_H = 3,
82
+ CUDNN_RNN_ALGO_COUNT = 4,
83
+ } cudnnRNNAlgo_t;
84
+
85
+ typedef enum {
86
+ CUDNN_FWD_MODE_INFERENCE = 0,
87
+ CUDNN_FWD_MODE_TRAINING = 1,
88
+ } cudnnForwardMode_t;
89
+
90
+ typedef enum {
91
+ CUDNN_RNN_RELU = 0, /* basic RNN cell type with ReLu activation */
92
+ CUDNN_RNN_TANH = 1, /* basic RNN cell type with tanh activation */
93
+ CUDNN_LSTM = 2, /* LSTM with optional recurrent projection and clipping */
94
+ CUDNN_GRU = 3, /* Using h' = tanh(r * Uh(t-1) + Wx) and h = (1 - z) * h' + z * h(t-1); */
95
+ } cudnnRNNMode_t;
96
+
97
+ typedef enum {
98
+ CUDNN_RNN_NO_BIAS = 0, /* rnn cell formulas do not use biases */
99
+ CUDNN_RNN_SINGLE_INP_BIAS = 1, /* rnn cell formulas use one input bias in input GEMM */
100
+ CUDNN_RNN_DOUBLE_BIAS = 2, /* default, rnn cell formulas use two bias vectors */
101
+ CUDNN_RNN_SINGLE_REC_BIAS = 3 /* rnn cell formulas use one recurrent bias in recurrent GEMM */
102
+ } cudnnRNNBiasMode_t;
103
+
104
+ typedef enum {
105
+ CUDNN_UNIDIRECTIONAL = 0, /* single direction network */
106
+ CUDNN_BIDIRECTIONAL = 1, /* output concatination at each layer */
107
+ } cudnnDirectionMode_t;
108
+
109
+ typedef enum {
110
+ CUDNN_LINEAR_INPUT = 0, /* adjustable weight matrix in first layer input GEMM */
111
+ CUDNN_SKIP_INPUT = 1, /* fixed identity matrix in the first layer input GEMM */
112
+ } cudnnRNNInputMode_t;
113
+
114
+ typedef enum {
115
+ CUDNN_RNN_CLIP_NONE = 0, /* disables LSTM cell clipping */
116
+ CUDNN_RNN_CLIP_MINMAX = 1, /* enables LSTM cell clipping */
117
+ } cudnnRNNClipMode_t;
118
+
119
+ typedef enum {
120
+ CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED = 0, /* padded, outer stride from one time-step to the next */
121
+ CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED = 1, /* sequence length sorted and packed as in basic RNN api */
122
+ CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED = 2, /* padded, outer stride from one batch to the next */
123
+ } cudnnRNNDataLayout_t;
124
+
125
+ /* For auxFlags in cudnnSetRNNDescriptor_v8() */
126
+ #define CUDNN_RNN_PADDED_IO_DISABLED 0
127
+ #define CUDNN_RNN_PADDED_IO_ENABLED (1U << 0)
128
+
129
+ struct cudnnRNNStruct;
130
+ typedef struct cudnnRNNStruct *cudnnRNNDescriptor_t;
131
+
132
+ struct cudnnRNNDataStruct;
133
+ typedef struct cudnnRNNDataStruct *cudnnRNNDataDescriptor_t;
134
+
135
+ cudnnStatus_t CUDNNWINAPI
136
+ cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t *rnnDesc);
137
+
138
+ cudnnStatus_t CUDNNWINAPI
139
+ cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc);
140
+
141
+ /*
142
+ * mathPrec in cudnnSetRNNDescriptor_v8() specifies compute precision.
143
+ * Compute precision is further modified by mathType that sets the
144
+ * preferred option for using NVIDIA Tensor Cores. dataType specify
145
+ * input/output data type and weight/bias type.
146
+ */
147
+
148
+ cudnnStatus_t CUDNNWINAPI
149
+ cudnnSetRNNDescriptor_v8(cudnnRNNDescriptor_t rnnDesc,
150
+ cudnnRNNAlgo_t algo,
151
+ cudnnRNNMode_t cellMode,
152
+ cudnnRNNBiasMode_t biasMode,
153
+ cudnnDirectionMode_t dirMode,
154
+ cudnnRNNInputMode_t inputMode,
155
+ cudnnDataType_t dataType,
156
+ cudnnDataType_t mathPrec,
157
+ cudnnMathType_t mathType,
158
+ int32_t inputSize,
159
+ int32_t hiddenSize,
160
+ int32_t projSize,
161
+ int32_t numLayers,
162
+ cudnnDropoutDescriptor_t dropoutDesc,
163
+ uint32_t auxFlags);
164
+
165
+ cudnnStatus_t CUDNNWINAPI
166
+ cudnnGetRNNDescriptor_v8(cudnnRNNDescriptor_t rnnDesc,
167
+ cudnnRNNAlgo_t *algo,
168
+ cudnnRNNMode_t *cellMode,
169
+ cudnnRNNBiasMode_t *biasMode,
170
+ cudnnDirectionMode_t *dirMode,
171
+ cudnnRNNInputMode_t *inputMode,
172
+ cudnnDataType_t *dataType,
173
+ cudnnDataType_t *mathPrec,
174
+ cudnnMathType_t *mathType,
175
+ int32_t *inputSize,
176
+ int32_t *hiddenSize,
177
+ int32_t *projSize,
178
+ int32_t *numLayers,
179
+ cudnnDropoutDescriptor_t *dropoutDesc,
180
+ uint32_t *auxFlags);
181
+
182
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
183
+ cudnnRNNSetClip_v8(cudnnRNNDescriptor_t rnnDesc,
184
+ cudnnRNNClipMode_t clipMode,
185
+ cudnnNanPropagation_t clipNanOpt,
186
+ double lclip,
187
+ double rclip);
188
+
189
+ cudnnStatus_t CUDNNWINAPI
190
+ cudnnRNNSetClip_v9(cudnnRNNDescriptor_t rnnDesc, cudnnRNNClipMode_t clipMode, double lclip, double rclip);
191
+
192
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
193
+ cudnnRNNGetClip_v8(cudnnRNNDescriptor_t rnnDesc,
194
+ cudnnRNNClipMode_t *clipMode,
195
+ cudnnNanPropagation_t *clipNanOpt,
196
+ double *lclip,
197
+ double *rclip);
198
+
199
+ cudnnStatus_t CUDNNWINAPI
200
+ cudnnRNNGetClip_v9(cudnnRNNDescriptor_t rnnDesc, cudnnRNNClipMode_t *clipMode, double *lclip, double *rclip);
201
+
202
+ cudnnStatus_t CUDNNWINAPI
203
+ cudnnBuildRNNDynamic(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, int miniBatch);
204
+
205
+ cudnnStatus_t CUDNNWINAPI
206
+ cudnnGetRNNTempSpaceSizes(cudnnHandle_t handle,
207
+ cudnnRNNDescriptor_t rnnDesc,
208
+ cudnnForwardMode_t fwdMode,
209
+ cudnnRNNDataDescriptor_t xDesc,
210
+ size_t *workSpaceSize,
211
+ size_t *reserveSpaceSize);
212
+
213
+ cudnnStatus_t CUDNNWINAPI
214
+ cudnnGetRNNWeightSpaceSize(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, size_t *weightSpaceSize);
215
+
216
+ cudnnStatus_t CUDNNWINAPI
217
+ cudnnGetRNNWeightParams(cudnnHandle_t handle,
218
+ cudnnRNNDescriptor_t rnnDesc,
219
+ int32_t pseudoLayer,
220
+ size_t weightSpaceSize,
221
+ const void *weightSpace,
222
+ int32_t linLayerID,
223
+ cudnnTensorDescriptor_t mDesc,
224
+ void **mAddr,
225
+ cudnnTensorDescriptor_t bDesc,
226
+ void **bAddr);
227
+
228
+ cudnnStatus_t CUDNNWINAPI
229
+ cudnnCreateRNNDataDescriptor(cudnnRNNDataDescriptor_t *rnnDataDesc);
230
+
231
+ cudnnStatus_t CUDNNWINAPI
232
+ cudnnDestroyRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc);
233
+
234
+ cudnnStatus_t CUDNNWINAPI
235
+ cudnnSetRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc,
236
+ cudnnDataType_t dataType,
237
+ cudnnRNNDataLayout_t layout,
238
+ int maxSeqLength,
239
+ int batchSize,
240
+ int vectorSize,
241
+ const int seqLengthArray[], /* length of each sequence in the batch */
242
+ void *paddingFill); /* symbol for filling padding position in output */
243
+
244
+ cudnnStatus_t CUDNNWINAPI
245
+ cudnnGetRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc,
246
+ cudnnDataType_t *dataType,
247
+ cudnnRNNDataLayout_t *layout,
248
+ int *maxSeqLength,
249
+ int *batchSize,
250
+ int *vectorSize,
251
+ int arrayLengthRequested,
252
+ int seqLengthArray[],
253
+ void *paddingFill);
254
+
255
+ cudnnStatus_t CUDNNWINAPI
256
+ cudnnRNNForward(cudnnHandle_t handle,
257
+ cudnnRNNDescriptor_t rnnDesc,
258
+ cudnnForwardMode_t fwdMode,
259
+ const int32_t devSeqLengths[],
260
+ cudnnRNNDataDescriptor_t xDesc,
261
+ const void *x,
262
+ cudnnRNNDataDescriptor_t yDesc,
263
+ void *y,
264
+ cudnnTensorDescriptor_t hDesc,
265
+ const void *hx,
266
+ void *hy,
267
+ cudnnTensorDescriptor_t cDesc,
268
+ const void *cx,
269
+ void *cy,
270
+ size_t weightSpaceSize,
271
+ const void *weightSpace,
272
+ size_t workSpaceSize,
273
+ void *workSpace,
274
+ size_t reserveSpaceSize,
275
+ void *reserveSpace);
276
+
277
+ /* Sequence data descriptor */
278
+
279
+ typedef enum {
280
+ CUDNN_SEQDATA_TIME_DIM = 0, /* index in time */
281
+ CUDNN_SEQDATA_BATCH_DIM = 1, /* index in batch */
282
+ CUDNN_SEQDATA_BEAM_DIM = 2, /* index in beam */
283
+ CUDNN_SEQDATA_VECT_DIM = 3 /* index in vector */
284
+ } cudnnSeqDataAxis_t;
285
+
286
+ struct cudnnSeqDataStruct;
287
+ typedef struct cudnnSeqDataStruct *cudnnSeqDataDescriptor_t CUDNN_DEPRECATED;
288
+
289
+ #define CUDNN_SEQDATA_DIM_COUNT 4 /* dimension count */
290
+
291
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
292
+ cudnnCreateSeqDataDescriptor(cudnnSeqDataDescriptor_t *seqDataDesc);
293
+
294
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
295
+ cudnnDestroySeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc);
296
+
297
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
298
+ cudnnSetSeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc,
299
+ cudnnDataType_t dataType,
300
+ int nbDims,
301
+ const int dimA[],
302
+ const cudnnSeqDataAxis_t axes[],
303
+ size_t seqLengthArraySize,
304
+ const int seqLengthArray[],
305
+ void *paddingFill);
306
+
307
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
308
+ cudnnGetSeqDataDescriptor(const cudnnSeqDataDescriptor_t seqDataDesc,
309
+ cudnnDataType_t *dataType,
310
+ int *nbDims,
311
+ int nbDimsRequested,
312
+ int dimA[],
313
+ cudnnSeqDataAxis_t axes[],
314
+ size_t *seqLengthArraySize,
315
+ size_t seqLengthSizeRequested,
316
+ int seqLengthArray[],
317
+ void *paddingFill);
318
+
319
+ /* Multihead Attention */
320
+
321
+ /*
322
+ * Multi-head attention options passed via 'attnMode' in cudnnSetAttnDescriptor().
323
+ * Use the bitwise OR operator to combine several settings listed below. Additional
324
+ * minor options can be added here w/o changing or introducing new API functions.
325
+ */
326
+ #define CUDNN_ATTN_QUERYMAP_ALL_TO_ONE 0 /* multiple Q-s map to a single (K,V) set when beam size > 1 */
327
+ #define CUDNN_ATTN_QUERYMAP_ONE_TO_ONE (1U << 0) /* multiple Q-s map to multiple (K,V) sets when beam size > 1 */
328
+ #define CUDNN_ATTN_DISABLE_PROJ_BIASES 0 /* no biases in attention input and output projections */
329
+ #define CUDNN_ATTN_ENABLE_PROJ_BIASES (1U << 1) /* use biases in attention input and output projections */
330
+
331
+ struct cudnnAttnStruct;
332
+ typedef struct cudnnAttnStruct *cudnnAttnDescriptor_t CUDNN_DEPRECATED;
333
+
334
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
335
+ cudnnCreateAttnDescriptor(cudnnAttnDescriptor_t *attnDesc);
336
+
337
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
338
+ cudnnDestroyAttnDescriptor(cudnnAttnDescriptor_t attnDesc);
339
+
340
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
341
+ cudnnSetAttnDescriptor(cudnnAttnDescriptor_t attnDesc,
342
+ unsigned attnMode,
343
+ int nHeads,
344
+ double smScaler,
345
+ cudnnDataType_t dataType,
346
+ cudnnDataType_t computePrec,
347
+ cudnnMathType_t mathType,
348
+ cudnnDropoutDescriptor_t attnDropoutDesc,
349
+ cudnnDropoutDescriptor_t postDropoutDesc,
350
+ int qSize,
351
+ int kSize,
352
+ int vSize,
353
+ int qProjSize,
354
+ int kProjSize,
355
+ int vProjSize,
356
+ int oProjSize,
357
+ int qoMaxSeqLength,
358
+ int kvMaxSeqLength,
359
+ int maxBatchSize,
360
+ int maxBeamSize);
361
+
362
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
363
+ cudnnGetAttnDescriptor(cudnnAttnDescriptor_t attnDesc,
364
+ unsigned *attnMode,
365
+ int *nHeads,
366
+ double *smScaler,
367
+ cudnnDataType_t *dataType,
368
+ cudnnDataType_t *computePrec,
369
+ cudnnMathType_t *mathType,
370
+ cudnnDropoutDescriptor_t *attnDropoutDesc,
371
+ cudnnDropoutDescriptor_t *postDropoutDesc,
372
+ int *qSize,
373
+ int *kSize,
374
+ int *vSize,
375
+ int *qProjSize,
376
+ int *kProjSize,
377
+ int *vProjSize,
378
+ int *oProjSize,
379
+ int *qoMaxSeqLength,
380
+ int *kvMaxSeqLength,
381
+ int *maxBatchSize,
382
+ int *maxBeamSize);
383
+
384
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
385
+ cudnnGetMultiHeadAttnBuffers(cudnnHandle_t handle,
386
+ const cudnnAttnDescriptor_t attnDesc,
387
+ size_t *weightSizeInBytes,
388
+ size_t *workSpaceSizeInBytes,
389
+ size_t *reserveSpaceSizeInBytes);
390
+
391
+ typedef enum {
392
+ CUDNN_MH_ATTN_Q_WEIGHTS = 0, /* input projection weights for 'queries' */
393
+ CUDNN_MH_ATTN_K_WEIGHTS = 1, /* input projection weights for 'keys' */
394
+ CUDNN_MH_ATTN_V_WEIGHTS = 2, /* input projection weights for 'values' */
395
+ CUDNN_MH_ATTN_O_WEIGHTS = 3, /* output projection weights */
396
+ CUDNN_MH_ATTN_Q_BIASES = 4, /* input projection bias tensor for 'queries' */
397
+ CUDNN_MH_ATTN_K_BIASES = 5, /* input projection bias for 'keys' */
398
+ CUDNN_MH_ATTN_V_BIASES = 6, /* input projection bias for 'values' */
399
+ CUDNN_MH_ATTN_O_BIASES = 7, /* output projection biases */
400
+ } cudnnMultiHeadAttnWeightKind_t;
401
+
402
+ #define CUDNN_ATTN_WKIND_COUNT 8 /* Number of attention weight/bias tensors */
403
+
404
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
405
+ cudnnGetMultiHeadAttnWeights(cudnnHandle_t handle,
406
+ const cudnnAttnDescriptor_t attnDesc,
407
+ cudnnMultiHeadAttnWeightKind_t wKind,
408
+ size_t weightSizeInBytes,
409
+ const void *weights,
410
+ cudnnTensorDescriptor_t wDesc,
411
+ void **wAddr);
412
+
413
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
414
+ cudnnMultiHeadAttnForward(cudnnHandle_t handle,
415
+ const cudnnAttnDescriptor_t attnDesc,
416
+ int currIdx,
417
+ const int loWinIdx[],
418
+ const int hiWinIdx[],
419
+ const int devSeqLengthsQO[],
420
+ const int devSeqLengthsKV[],
421
+ const cudnnSeqDataDescriptor_t qDesc,
422
+ const void *queries,
423
+ const void *residuals,
424
+ const cudnnSeqDataDescriptor_t kDesc,
425
+ const void *keys,
426
+ const cudnnSeqDataDescriptor_t vDesc,
427
+ const void *values,
428
+ const cudnnSeqDataDescriptor_t oDesc,
429
+ void *out,
430
+ size_t weightSizeInBytes,
431
+ const void *weights,
432
+ size_t workSpaceSizeInBytes,
433
+ void *workSpace,
434
+ size_t reserveSpaceSizeInBytes,
435
+ void *reserveSpace);
436
+
437
+ /*
438
+ * \brief Cross-library version checker.
439
+ * This function is implemented differently in each sub-library. Each sublib
440
+ * checks whether its own version matches that of its dependencies.
441
+ * \returns CUDNN_STATUS_SUCCESS if the version check passes,
442
+ * CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH if the versions are inconsistent.
443
+ */
444
+ cudnnStatus_t CUDNNWINAPI
445
+ cudnnAdvVersionCheck(void);
446
+
447
+ typedef enum {
448
+ CUDNN_WGRAD_MODE_ADD = 0, /* add partial gradients to wgrad output buffers */
449
+ CUDNN_WGRAD_MODE_SET = 1, /* write partial gradients to wgrad output buffers */
450
+ } cudnnWgradMode_t;
451
+
452
+ cudnnStatus_t CUDNNWINAPI
453
+ cudnnRNNBackwardData_v8(cudnnHandle_t handle,
454
+ cudnnRNNDescriptor_t rnnDesc,
455
+ const int32_t devSeqLengths[],
456
+ cudnnRNNDataDescriptor_t yDesc,
457
+ const void *y,
458
+ const void *dy,
459
+ cudnnRNNDataDescriptor_t xDesc,
460
+ void *dx,
461
+ cudnnTensorDescriptor_t hDesc,
462
+ const void *hx,
463
+ const void *dhy,
464
+ void *dhx,
465
+ cudnnTensorDescriptor_t cDesc,
466
+ const void *cx,
467
+ const void *dcy,
468
+ void *dcx,
469
+ size_t weightSpaceSize,
470
+ const void *weightSpace,
471
+ size_t workSpaceSize,
472
+ void *workSpace,
473
+ size_t reserveSpaceSize,
474
+ void *reserveSpace);
475
+
476
+ cudnnStatus_t CUDNNWINAPI
477
+ cudnnRNNBackwardWeights_v8(cudnnHandle_t handle,
478
+ cudnnRNNDescriptor_t rnnDesc,
479
+ cudnnWgradMode_t addGrad,
480
+ const int32_t devSeqLengths[],
481
+ cudnnRNNDataDescriptor_t xDesc,
482
+ const void *x,
483
+ cudnnTensorDescriptor_t hDesc,
484
+ const void *hx,
485
+ cudnnRNNDataDescriptor_t yDesc,
486
+ const void *y,
487
+ size_t weightSpaceSize,
488
+ void *dweightSpace,
489
+ size_t workSpaceSize,
490
+ void *workSpace,
491
+ size_t reserveSpaceSize,
492
+ void *reserveSpace);
493
+
494
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
495
+ cudnnMultiHeadAttnBackwardData(cudnnHandle_t handle,
496
+ const cudnnAttnDescriptor_t attnDesc,
497
+ const int loWinIdx[],
498
+ const int hiWinIdx[],
499
+ const int devSeqLengthsDQDO[],
500
+ const int devSeqLengthsDKDV[],
501
+ const cudnnSeqDataDescriptor_t doDesc,
502
+ const void *dout,
503
+ const cudnnSeqDataDescriptor_t dqDesc,
504
+ void *dqueries,
505
+ const void *queries,
506
+ const cudnnSeqDataDescriptor_t dkDesc,
507
+ void *dkeys,
508
+ const void *keys,
509
+ const cudnnSeqDataDescriptor_t dvDesc,
510
+ void *dvalues,
511
+ const void *values,
512
+ size_t weightSizeInBytes,
513
+ const void *weights,
514
+ size_t workSpaceSizeInBytes,
515
+ void *workSpace,
516
+ size_t reserveSpaceSizeInBytes,
517
+ void *reserveSpace);
518
+
519
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
520
+ cudnnMultiHeadAttnBackwardWeights(cudnnHandle_t handle,
521
+ const cudnnAttnDescriptor_t attnDesc,
522
+ cudnnWgradMode_t addGrad,
523
+ const cudnnSeqDataDescriptor_t qDesc,
524
+ const void *queries,
525
+ const cudnnSeqDataDescriptor_t kDesc,
526
+ const void *keys,
527
+ const cudnnSeqDataDescriptor_t vDesc,
528
+ const void *values,
529
+ const cudnnSeqDataDescriptor_t doDesc,
530
+ const void *dout,
531
+ size_t weightSizeInBytes,
532
+ const void *weights,
533
+ void *dweights,
534
+ size_t workSpaceSizeInBytes,
535
+ void *workSpace,
536
+ size_t reserveSpaceSizeInBytes,
537
+ void *reserveSpace);
538
+
539
+ /*
540
+ * CTC (Connectionist Temporal Classification) loss descriptor create/destory/set/get functions
541
+ */
542
+ /* Input normalization mode for loss function */
543
+ typedef enum {
544
+ CUDNN_LOSS_NORMALIZATION_NONE = 0,
545
+ CUDNN_LOSS_NORMALIZATION_SOFTMAX = 1,
546
+ } cudnnLossNormalizationMode_t;
547
+
548
+ cudnnStatus_t CUDNNWINAPI
549
+ cudnnCreateCTCLossDescriptor(cudnnCTCLossDescriptor_t *ctcLossDesc);
550
+
551
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
552
+ cudnnSetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t compType);
553
+
554
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
555
+ cudnnSetCTCLossDescriptorEx(cudnnCTCLossDescriptor_t ctcLossDesc,
556
+ cudnnDataType_t compType,
557
+ cudnnLossNormalizationMode_t normMode,
558
+ cudnnNanPropagation_t gradMode);
559
+
560
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
561
+ cudnnSetCTCLossDescriptor_v8(cudnnCTCLossDescriptor_t ctcLossDesc,
562
+ cudnnDataType_t compType,
563
+ cudnnLossNormalizationMode_t normMode,
564
+ cudnnNanPropagation_t gradMode,
565
+ int maxLabelLength);
566
+
567
+ cudnnStatus_t CUDNNWINAPI
568
+ cudnnSetCTCLossDescriptor_v9(cudnnCTCLossDescriptor_t ctcLossDesc,
569
+ cudnnDataType_t compType,
570
+ cudnnLossNormalizationMode_t normMode,
571
+ cudnnCTCGradMode_t ctcGradMode,
572
+ int maxLabelLength);
573
+
574
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
575
+ cudnnGetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t *compType);
576
+
577
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
578
+ cudnnGetCTCLossDescriptorEx(cudnnCTCLossDescriptor_t ctcLossDesc,
579
+ cudnnDataType_t *compType,
580
+ cudnnLossNormalizationMode_t *normMode,
581
+ cudnnNanPropagation_t *gradMode);
582
+
583
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
584
+ cudnnGetCTCLossDescriptor_v8(cudnnCTCLossDescriptor_t ctcLossDesc,
585
+ cudnnDataType_t *compType,
586
+ cudnnLossNormalizationMode_t *normMode,
587
+ cudnnNanPropagation_t *gradMode,
588
+ int *maxLabelLength);
589
+
590
+ cudnnStatus_t CUDNNWINAPI
591
+ cudnnGetCTCLossDescriptor_v9(cudnnCTCLossDescriptor_t ctcLossDesc,
592
+ cudnnDataType_t *compType,
593
+ cudnnLossNormalizationMode_t *normMode,
594
+ cudnnCTCGradMode_t *ctcGradMode,
595
+ int *maxLabelLength);
596
+
597
+ cudnnStatus_t CUDNNWINAPI
598
+ cudnnDestroyCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc);
599
+
600
+ /* return the ctc costs and gradients, given the probabilities and labels */
601
+ cudnnStatus_t CUDNNWINAPI
602
+ cudnnCTCLoss(
603
+ cudnnHandle_t handle,
604
+ const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the
605
+ timing steps, N is the mini batch size, A is the alphabet size) */
606
+ const void *probs, /* probabilities after softmax, in GPU memory */
607
+ const int hostLabels[], /* labels, in CPU memory */
608
+ const int hostLabelLengths[], /* the length of each label, in CPU memory */
609
+ const int hostInputLengths[], /* the lengths of timing steps in each batch, in CPU memory */
610
+ void *costs, /* the returned costs of CTC, in GPU memory */
611
+ const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */
612
+ void *gradients, /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */
613
+ cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
614
+ cudnnCTCLossDescriptor_t ctcLossDesc,
615
+ void *workspace, /* pointer to the workspace, in GPU memory */
616
+ size_t workSpaceSizeInBytes); /* size of the workspace */
617
+
618
+ /* return the ctc costs and gradients, given the probabilities and labels */
619
+ cudnnStatus_t CUDNNWINAPI
620
+ cudnnCTCLoss_v8(
621
+ cudnnHandle_t handle,
622
+ cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
623
+ cudnnCTCLossDescriptor_t ctcLossDesc,
624
+ const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the
625
+ timing steps, N is the mini batch size, A is the alphabet size) */
626
+ const void *probs, /* probabilities after softmax, in GPU memory */
627
+ const int labels[], /* labels, in GPU memory */
628
+ const int labelLengths[], /* the length of each label, in GPU memory */
629
+ const int inputLengths[], /* the lengths of timing steps in each batch, in GPU memory */
630
+ void *costs, /* the returned costs of CTC, in GPU memory */
631
+ const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */
632
+ void *gradients, /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */
633
+ size_t workSpaceSizeInBytes, /* size of the workspace */
634
+ void *workspace); /* pointer to the workspace, in GPU memory */
635
+
636
+ /* return the workspace size needed for ctc */
637
+ cudnnStatus_t CUDNNWINAPI
638
+ cudnnGetCTCLossWorkspaceSize(
639
+ cudnnHandle_t handle,
640
+ const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the
641
+ timing steps, N is the mini batch size, A is the alphabet size) */
642
+ const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the
643
+ dimensions are T,N,A. To compute costs
644
+ only, set it to NULL */
645
+ const int *labels, /* labels, in CPU memory */
646
+ const int *labelLengths, /* the length of each label, in CPU memory */
647
+ const int *inputLengths, /* the lengths of timing steps in each batch, in CPU memory */
648
+ cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
649
+ cudnnCTCLossDescriptor_t ctcLossDesc,
650
+ size_t *sizeInBytes); /* pointer to the returned workspace size */
651
+
652
+ /* return the workspace size needed for ctc */
653
+ cudnnStatus_t CUDNNWINAPI
654
+ cudnnGetCTCLossWorkspaceSize_v8(
655
+ cudnnHandle_t handle,
656
+ cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
657
+ cudnnCTCLossDescriptor_t ctcLossDesc,
658
+ const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the
659
+ timing steps, N is the mini batch size, A is the alphabet size) */
660
+ const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the
661
+ dimensions are T,N,A. To compute costs
662
+ only, set it to NULL */
663
+ size_t *sizeInBytes); /* pointer to the returned workspace size */
664
+
665
+ #if defined(__cplusplus)
666
+ }
667
+ #endif
668
+
669
+ #endif /* CUDNN_ADV_H_ */
.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_adv_v9.h ADDED
@@ -0,0 +1,669 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2014-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /* cudnn_adv : cuDNN's advanced and experimental features.
51
+
52
+ */
53
+
54
+ #if !defined(CUDNN_ADV_H_)
55
+ #define CUDNN_ADV_H_
56
+
57
+ #include <stdint.h>
58
+
59
+ #include "cudnn_version.h"
60
+ #include "cudnn_ops.h"
61
+
62
+ /* These version numbers are autogenerated, do not edit manually. */
63
+ #define CUDNN_ADV_MAJOR 9
64
+ #define CUDNN_ADV_MINOR 10
65
+ #define CUDNN_ADV_PATCH 2
66
+
67
+ #if (CUDNN_ADV_MAJOR != CUDNN_MAJOR) || (CUDNN_ADV_MINOR != CUDNN_MINOR) || (CUDNN_ADV_PATCH != CUDNN_PATCHLEVEL)
68
+ #error Version mismatch in cuDNN ADV INFER!!!
69
+ #endif
70
+
71
+ #if defined(__cplusplus)
72
+ extern "C" {
73
+ #endif
74
+
75
+ /* BASIC RNN API */
76
+
77
+ typedef enum {
78
+ CUDNN_RNN_ALGO_STANDARD = 0,
79
+ CUDNN_RNN_ALGO_PERSIST_STATIC = 1,
80
+ CUDNN_RNN_ALGO_PERSIST_DYNAMIC = 2,
81
+ CUDNN_RNN_ALGO_PERSIST_STATIC_SMALL_H = 3,
82
+ CUDNN_RNN_ALGO_COUNT = 4,
83
+ } cudnnRNNAlgo_t;
84
+
85
+ typedef enum {
86
+ CUDNN_FWD_MODE_INFERENCE = 0,
87
+ CUDNN_FWD_MODE_TRAINING = 1,
88
+ } cudnnForwardMode_t;
89
+
90
+ typedef enum {
91
+ CUDNN_RNN_RELU = 0, /* basic RNN cell type with ReLu activation */
92
+ CUDNN_RNN_TANH = 1, /* basic RNN cell type with tanh activation */
93
+ CUDNN_LSTM = 2, /* LSTM with optional recurrent projection and clipping */
94
+ CUDNN_GRU = 3, /* Using h' = tanh(r * Uh(t-1) + Wx) and h = (1 - z) * h' + z * h(t-1); */
95
+ } cudnnRNNMode_t;
96
+
97
+ typedef enum {
98
+ CUDNN_RNN_NO_BIAS = 0, /* rnn cell formulas do not use biases */
99
+ CUDNN_RNN_SINGLE_INP_BIAS = 1, /* rnn cell formulas use one input bias in input GEMM */
100
+ CUDNN_RNN_DOUBLE_BIAS = 2, /* default, rnn cell formulas use two bias vectors */
101
+ CUDNN_RNN_SINGLE_REC_BIAS = 3 /* rnn cell formulas use one recurrent bias in recurrent GEMM */
102
+ } cudnnRNNBiasMode_t;
103
+
104
+ typedef enum {
105
+ CUDNN_UNIDIRECTIONAL = 0, /* single direction network */
106
+ CUDNN_BIDIRECTIONAL = 1, /* output concatination at each layer */
107
+ } cudnnDirectionMode_t;
108
+
109
+ typedef enum {
110
+ CUDNN_LINEAR_INPUT = 0, /* adjustable weight matrix in first layer input GEMM */
111
+ CUDNN_SKIP_INPUT = 1, /* fixed identity matrix in the first layer input GEMM */
112
+ } cudnnRNNInputMode_t;
113
+
114
+ typedef enum {
115
+ CUDNN_RNN_CLIP_NONE = 0, /* disables LSTM cell clipping */
116
+ CUDNN_RNN_CLIP_MINMAX = 1, /* enables LSTM cell clipping */
117
+ } cudnnRNNClipMode_t;
118
+
119
+ typedef enum {
120
+ CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED = 0, /* padded, outer stride from one time-step to the next */
121
+ CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED = 1, /* sequence length sorted and packed as in basic RNN api */
122
+ CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED = 2, /* padded, outer stride from one batch to the next */
123
+ } cudnnRNNDataLayout_t;
124
+
125
+ /* For auxFlags in cudnnSetRNNDescriptor_v8() */
126
+ #define CUDNN_RNN_PADDED_IO_DISABLED 0
127
+ #define CUDNN_RNN_PADDED_IO_ENABLED (1U << 0)
128
+
129
+ struct cudnnRNNStruct;
130
+ typedef struct cudnnRNNStruct *cudnnRNNDescriptor_t;
131
+
132
+ struct cudnnRNNDataStruct;
133
+ typedef struct cudnnRNNDataStruct *cudnnRNNDataDescriptor_t;
134
+
135
+ cudnnStatus_t CUDNNWINAPI
136
+ cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t *rnnDesc);
137
+
138
+ cudnnStatus_t CUDNNWINAPI
139
+ cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc);
140
+
141
+ /*
142
+ * mathPrec in cudnnSetRNNDescriptor_v8() specifies compute precision.
143
+ * Compute precision is further modified by mathType that sets the
144
+ * preferred option for using NVIDIA Tensor Cores. dataType specify
145
+ * input/output data type and weight/bias type.
146
+ */
147
+
148
+ cudnnStatus_t CUDNNWINAPI
149
+ cudnnSetRNNDescriptor_v8(cudnnRNNDescriptor_t rnnDesc,
150
+ cudnnRNNAlgo_t algo,
151
+ cudnnRNNMode_t cellMode,
152
+ cudnnRNNBiasMode_t biasMode,
153
+ cudnnDirectionMode_t dirMode,
154
+ cudnnRNNInputMode_t inputMode,
155
+ cudnnDataType_t dataType,
156
+ cudnnDataType_t mathPrec,
157
+ cudnnMathType_t mathType,
158
+ int32_t inputSize,
159
+ int32_t hiddenSize,
160
+ int32_t projSize,
161
+ int32_t numLayers,
162
+ cudnnDropoutDescriptor_t dropoutDesc,
163
+ uint32_t auxFlags);
164
+
165
+ cudnnStatus_t CUDNNWINAPI
166
+ cudnnGetRNNDescriptor_v8(cudnnRNNDescriptor_t rnnDesc,
167
+ cudnnRNNAlgo_t *algo,
168
+ cudnnRNNMode_t *cellMode,
169
+ cudnnRNNBiasMode_t *biasMode,
170
+ cudnnDirectionMode_t *dirMode,
171
+ cudnnRNNInputMode_t *inputMode,
172
+ cudnnDataType_t *dataType,
173
+ cudnnDataType_t *mathPrec,
174
+ cudnnMathType_t *mathType,
175
+ int32_t *inputSize,
176
+ int32_t *hiddenSize,
177
+ int32_t *projSize,
178
+ int32_t *numLayers,
179
+ cudnnDropoutDescriptor_t *dropoutDesc,
180
+ uint32_t *auxFlags);
181
+
182
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
183
+ cudnnRNNSetClip_v8(cudnnRNNDescriptor_t rnnDesc,
184
+ cudnnRNNClipMode_t clipMode,
185
+ cudnnNanPropagation_t clipNanOpt,
186
+ double lclip,
187
+ double rclip);
188
+
189
+ cudnnStatus_t CUDNNWINAPI
190
+ cudnnRNNSetClip_v9(cudnnRNNDescriptor_t rnnDesc, cudnnRNNClipMode_t clipMode, double lclip, double rclip);
191
+
192
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
193
+ cudnnRNNGetClip_v8(cudnnRNNDescriptor_t rnnDesc,
194
+ cudnnRNNClipMode_t *clipMode,
195
+ cudnnNanPropagation_t *clipNanOpt,
196
+ double *lclip,
197
+ double *rclip);
198
+
199
+ cudnnStatus_t CUDNNWINAPI
200
+ cudnnRNNGetClip_v9(cudnnRNNDescriptor_t rnnDesc, cudnnRNNClipMode_t *clipMode, double *lclip, double *rclip);
201
+
202
+ cudnnStatus_t CUDNNWINAPI
203
+ cudnnBuildRNNDynamic(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, int miniBatch);
204
+
205
+ cudnnStatus_t CUDNNWINAPI
206
+ cudnnGetRNNTempSpaceSizes(cudnnHandle_t handle,
207
+ cudnnRNNDescriptor_t rnnDesc,
208
+ cudnnForwardMode_t fwdMode,
209
+ cudnnRNNDataDescriptor_t xDesc,
210
+ size_t *workSpaceSize,
211
+ size_t *reserveSpaceSize);
212
+
213
+ cudnnStatus_t CUDNNWINAPI
214
+ cudnnGetRNNWeightSpaceSize(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, size_t *weightSpaceSize);
215
+
216
+ cudnnStatus_t CUDNNWINAPI
217
+ cudnnGetRNNWeightParams(cudnnHandle_t handle,
218
+ cudnnRNNDescriptor_t rnnDesc,
219
+ int32_t pseudoLayer,
220
+ size_t weightSpaceSize,
221
+ const void *weightSpace,
222
+ int32_t linLayerID,
223
+ cudnnTensorDescriptor_t mDesc,
224
+ void **mAddr,
225
+ cudnnTensorDescriptor_t bDesc,
226
+ void **bAddr);
227
+
228
+ cudnnStatus_t CUDNNWINAPI
229
+ cudnnCreateRNNDataDescriptor(cudnnRNNDataDescriptor_t *rnnDataDesc);
230
+
231
+ cudnnStatus_t CUDNNWINAPI
232
+ cudnnDestroyRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc);
233
+
234
+ cudnnStatus_t CUDNNWINAPI
235
+ cudnnSetRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc,
236
+ cudnnDataType_t dataType,
237
+ cudnnRNNDataLayout_t layout,
238
+ int maxSeqLength,
239
+ int batchSize,
240
+ int vectorSize,
241
+ const int seqLengthArray[], /* length of each sequence in the batch */
242
+ void *paddingFill); /* symbol for filling padding position in output */
243
+
244
+ cudnnStatus_t CUDNNWINAPI
245
+ cudnnGetRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc,
246
+ cudnnDataType_t *dataType,
247
+ cudnnRNNDataLayout_t *layout,
248
+ int *maxSeqLength,
249
+ int *batchSize,
250
+ int *vectorSize,
251
+ int arrayLengthRequested,
252
+ int seqLengthArray[],
253
+ void *paddingFill);
254
+
255
+ cudnnStatus_t CUDNNWINAPI
256
+ cudnnRNNForward(cudnnHandle_t handle,
257
+ cudnnRNNDescriptor_t rnnDesc,
258
+ cudnnForwardMode_t fwdMode,
259
+ const int32_t devSeqLengths[],
260
+ cudnnRNNDataDescriptor_t xDesc,
261
+ const void *x,
262
+ cudnnRNNDataDescriptor_t yDesc,
263
+ void *y,
264
+ cudnnTensorDescriptor_t hDesc,
265
+ const void *hx,
266
+ void *hy,
267
+ cudnnTensorDescriptor_t cDesc,
268
+ const void *cx,
269
+ void *cy,
270
+ size_t weightSpaceSize,
271
+ const void *weightSpace,
272
+ size_t workSpaceSize,
273
+ void *workSpace,
274
+ size_t reserveSpaceSize,
275
+ void *reserveSpace);
276
+
277
+ /* Sequence data descriptor */
278
+
279
+ typedef enum {
280
+ CUDNN_SEQDATA_TIME_DIM = 0, /* index in time */
281
+ CUDNN_SEQDATA_BATCH_DIM = 1, /* index in batch */
282
+ CUDNN_SEQDATA_BEAM_DIM = 2, /* index in beam */
283
+ CUDNN_SEQDATA_VECT_DIM = 3 /* index in vector */
284
+ } cudnnSeqDataAxis_t;
285
+
286
+ struct cudnnSeqDataStruct;
287
+ typedef struct cudnnSeqDataStruct *cudnnSeqDataDescriptor_t CUDNN_DEPRECATED;
288
+
289
+ #define CUDNN_SEQDATA_DIM_COUNT 4 /* dimension count */
290
+
291
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
292
+ cudnnCreateSeqDataDescriptor(cudnnSeqDataDescriptor_t *seqDataDesc);
293
+
294
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
295
+ cudnnDestroySeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc);
296
+
297
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
298
+ cudnnSetSeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc,
299
+ cudnnDataType_t dataType,
300
+ int nbDims,
301
+ const int dimA[],
302
+ const cudnnSeqDataAxis_t axes[],
303
+ size_t seqLengthArraySize,
304
+ const int seqLengthArray[],
305
+ void *paddingFill);
306
+
307
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
308
+ cudnnGetSeqDataDescriptor(const cudnnSeqDataDescriptor_t seqDataDesc,
309
+ cudnnDataType_t *dataType,
310
+ int *nbDims,
311
+ int nbDimsRequested,
312
+ int dimA[],
313
+ cudnnSeqDataAxis_t axes[],
314
+ size_t *seqLengthArraySize,
315
+ size_t seqLengthSizeRequested,
316
+ int seqLengthArray[],
317
+ void *paddingFill);
318
+
319
+ /* Multihead Attention */
320
+
321
+ /*
322
+ * Multi-head attention options passed via 'attnMode' in cudnnSetAttnDescriptor().
323
+ * Use the bitwise OR operator to combine several settings listed below. Additional
324
+ * minor options can be added here w/o changing or introducing new API functions.
325
+ */
326
+ #define CUDNN_ATTN_QUERYMAP_ALL_TO_ONE 0 /* multiple Q-s map to a single (K,V) set when beam size > 1 */
327
+ #define CUDNN_ATTN_QUERYMAP_ONE_TO_ONE (1U << 0) /* multiple Q-s map to multiple (K,V) sets when beam size > 1 */
328
+ #define CUDNN_ATTN_DISABLE_PROJ_BIASES 0 /* no biases in attention input and output projections */
329
+ #define CUDNN_ATTN_ENABLE_PROJ_BIASES (1U << 1) /* use biases in attention input and output projections */
330
+
331
+ struct cudnnAttnStruct;
332
+ typedef struct cudnnAttnStruct *cudnnAttnDescriptor_t CUDNN_DEPRECATED;
333
+
334
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
335
+ cudnnCreateAttnDescriptor(cudnnAttnDescriptor_t *attnDesc);
336
+
337
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
338
+ cudnnDestroyAttnDescriptor(cudnnAttnDescriptor_t attnDesc);
339
+
340
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
341
+ cudnnSetAttnDescriptor(cudnnAttnDescriptor_t attnDesc,
342
+ unsigned attnMode,
343
+ int nHeads,
344
+ double smScaler,
345
+ cudnnDataType_t dataType,
346
+ cudnnDataType_t computePrec,
347
+ cudnnMathType_t mathType,
348
+ cudnnDropoutDescriptor_t attnDropoutDesc,
349
+ cudnnDropoutDescriptor_t postDropoutDesc,
350
+ int qSize,
351
+ int kSize,
352
+ int vSize,
353
+ int qProjSize,
354
+ int kProjSize,
355
+ int vProjSize,
356
+ int oProjSize,
357
+ int qoMaxSeqLength,
358
+ int kvMaxSeqLength,
359
+ int maxBatchSize,
360
+ int maxBeamSize);
361
+
362
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
363
+ cudnnGetAttnDescriptor(cudnnAttnDescriptor_t attnDesc,
364
+ unsigned *attnMode,
365
+ int *nHeads,
366
+ double *smScaler,
367
+ cudnnDataType_t *dataType,
368
+ cudnnDataType_t *computePrec,
369
+ cudnnMathType_t *mathType,
370
+ cudnnDropoutDescriptor_t *attnDropoutDesc,
371
+ cudnnDropoutDescriptor_t *postDropoutDesc,
372
+ int *qSize,
373
+ int *kSize,
374
+ int *vSize,
375
+ int *qProjSize,
376
+ int *kProjSize,
377
+ int *vProjSize,
378
+ int *oProjSize,
379
+ int *qoMaxSeqLength,
380
+ int *kvMaxSeqLength,
381
+ int *maxBatchSize,
382
+ int *maxBeamSize);
383
+
384
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
385
+ cudnnGetMultiHeadAttnBuffers(cudnnHandle_t handle,
386
+ const cudnnAttnDescriptor_t attnDesc,
387
+ size_t *weightSizeInBytes,
388
+ size_t *workSpaceSizeInBytes,
389
+ size_t *reserveSpaceSizeInBytes);
390
+
391
+ typedef enum {
392
+ CUDNN_MH_ATTN_Q_WEIGHTS = 0, /* input projection weights for 'queries' */
393
+ CUDNN_MH_ATTN_K_WEIGHTS = 1, /* input projection weights for 'keys' */
394
+ CUDNN_MH_ATTN_V_WEIGHTS = 2, /* input projection weights for 'values' */
395
+ CUDNN_MH_ATTN_O_WEIGHTS = 3, /* output projection weights */
396
+ CUDNN_MH_ATTN_Q_BIASES = 4, /* input projection bias tensor for 'queries' */
397
+ CUDNN_MH_ATTN_K_BIASES = 5, /* input projection bias for 'keys' */
398
+ CUDNN_MH_ATTN_V_BIASES = 6, /* input projection bias for 'values' */
399
+ CUDNN_MH_ATTN_O_BIASES = 7, /* output projection biases */
400
+ } cudnnMultiHeadAttnWeightKind_t;
401
+
402
+ #define CUDNN_ATTN_WKIND_COUNT 8 /* Number of attention weight/bias tensors */
403
+
404
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
405
+ cudnnGetMultiHeadAttnWeights(cudnnHandle_t handle,
406
+ const cudnnAttnDescriptor_t attnDesc,
407
+ cudnnMultiHeadAttnWeightKind_t wKind,
408
+ size_t weightSizeInBytes,
409
+ const void *weights,
410
+ cudnnTensorDescriptor_t wDesc,
411
+ void **wAddr);
412
+
413
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
414
+ cudnnMultiHeadAttnForward(cudnnHandle_t handle,
415
+ const cudnnAttnDescriptor_t attnDesc,
416
+ int currIdx,
417
+ const int loWinIdx[],
418
+ const int hiWinIdx[],
419
+ const int devSeqLengthsQO[],
420
+ const int devSeqLengthsKV[],
421
+ const cudnnSeqDataDescriptor_t qDesc,
422
+ const void *queries,
423
+ const void *residuals,
424
+ const cudnnSeqDataDescriptor_t kDesc,
425
+ const void *keys,
426
+ const cudnnSeqDataDescriptor_t vDesc,
427
+ const void *values,
428
+ const cudnnSeqDataDescriptor_t oDesc,
429
+ void *out,
430
+ size_t weightSizeInBytes,
431
+ const void *weights,
432
+ size_t workSpaceSizeInBytes,
433
+ void *workSpace,
434
+ size_t reserveSpaceSizeInBytes,
435
+ void *reserveSpace);
436
+
437
+ /*
438
+ * \brief Cross-library version checker.
439
+ * This function is implemented differently in each sub-library. Each sublib
440
+ * checks whether its own version matches that of its dependencies.
441
+ * \returns CUDNN_STATUS_SUCCESS if the version check passes,
442
+ * CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH if the versions are inconsistent.
443
+ */
444
+ cudnnStatus_t CUDNNWINAPI
445
+ cudnnAdvVersionCheck(void);
446
+
447
+ typedef enum {
448
+ CUDNN_WGRAD_MODE_ADD = 0, /* add partial gradients to wgrad output buffers */
449
+ CUDNN_WGRAD_MODE_SET = 1, /* write partial gradients to wgrad output buffers */
450
+ } cudnnWgradMode_t;
451
+
452
+ cudnnStatus_t CUDNNWINAPI
453
+ cudnnRNNBackwardData_v8(cudnnHandle_t handle,
454
+ cudnnRNNDescriptor_t rnnDesc,
455
+ const int32_t devSeqLengths[],
456
+ cudnnRNNDataDescriptor_t yDesc,
457
+ const void *y,
458
+ const void *dy,
459
+ cudnnRNNDataDescriptor_t xDesc,
460
+ void *dx,
461
+ cudnnTensorDescriptor_t hDesc,
462
+ const void *hx,
463
+ const void *dhy,
464
+ void *dhx,
465
+ cudnnTensorDescriptor_t cDesc,
466
+ const void *cx,
467
+ const void *dcy,
468
+ void *dcx,
469
+ size_t weightSpaceSize,
470
+ const void *weightSpace,
471
+ size_t workSpaceSize,
472
+ void *workSpace,
473
+ size_t reserveSpaceSize,
474
+ void *reserveSpace);
475
+
476
+ cudnnStatus_t CUDNNWINAPI
477
+ cudnnRNNBackwardWeights_v8(cudnnHandle_t handle,
478
+ cudnnRNNDescriptor_t rnnDesc,
479
+ cudnnWgradMode_t addGrad,
480
+ const int32_t devSeqLengths[],
481
+ cudnnRNNDataDescriptor_t xDesc,
482
+ const void *x,
483
+ cudnnTensorDescriptor_t hDesc,
484
+ const void *hx,
485
+ cudnnRNNDataDescriptor_t yDesc,
486
+ const void *y,
487
+ size_t weightSpaceSize,
488
+ void *dweightSpace,
489
+ size_t workSpaceSize,
490
+ void *workSpace,
491
+ size_t reserveSpaceSize,
492
+ void *reserveSpace);
493
+
494
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
495
+ cudnnMultiHeadAttnBackwardData(cudnnHandle_t handle,
496
+ const cudnnAttnDescriptor_t attnDesc,
497
+ const int loWinIdx[],
498
+ const int hiWinIdx[],
499
+ const int devSeqLengthsDQDO[],
500
+ const int devSeqLengthsDKDV[],
501
+ const cudnnSeqDataDescriptor_t doDesc,
502
+ const void *dout,
503
+ const cudnnSeqDataDescriptor_t dqDesc,
504
+ void *dqueries,
505
+ const void *queries,
506
+ const cudnnSeqDataDescriptor_t dkDesc,
507
+ void *dkeys,
508
+ const void *keys,
509
+ const cudnnSeqDataDescriptor_t dvDesc,
510
+ void *dvalues,
511
+ const void *values,
512
+ size_t weightSizeInBytes,
513
+ const void *weights,
514
+ size_t workSpaceSizeInBytes,
515
+ void *workSpace,
516
+ size_t reserveSpaceSizeInBytes,
517
+ void *reserveSpace);
518
+
519
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
520
+ cudnnMultiHeadAttnBackwardWeights(cudnnHandle_t handle,
521
+ const cudnnAttnDescriptor_t attnDesc,
522
+ cudnnWgradMode_t addGrad,
523
+ const cudnnSeqDataDescriptor_t qDesc,
524
+ const void *queries,
525
+ const cudnnSeqDataDescriptor_t kDesc,
526
+ const void *keys,
527
+ const cudnnSeqDataDescriptor_t vDesc,
528
+ const void *values,
529
+ const cudnnSeqDataDescriptor_t doDesc,
530
+ const void *dout,
531
+ size_t weightSizeInBytes,
532
+ const void *weights,
533
+ void *dweights,
534
+ size_t workSpaceSizeInBytes,
535
+ void *workSpace,
536
+ size_t reserveSpaceSizeInBytes,
537
+ void *reserveSpace);
538
+
539
+ /*
540
+ * CTC (Connectionist Temporal Classification) loss descriptor create/destory/set/get functions
541
+ */
542
+ /* Input normalization mode for loss function */
543
+ typedef enum {
544
+ CUDNN_LOSS_NORMALIZATION_NONE = 0,
545
+ CUDNN_LOSS_NORMALIZATION_SOFTMAX = 1,
546
+ } cudnnLossNormalizationMode_t;
547
+
548
+ cudnnStatus_t CUDNNWINAPI
549
+ cudnnCreateCTCLossDescriptor(cudnnCTCLossDescriptor_t *ctcLossDesc);
550
+
551
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
552
+ cudnnSetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t compType);
553
+
554
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
555
+ cudnnSetCTCLossDescriptorEx(cudnnCTCLossDescriptor_t ctcLossDesc,
556
+ cudnnDataType_t compType,
557
+ cudnnLossNormalizationMode_t normMode,
558
+ cudnnNanPropagation_t gradMode);
559
+
560
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
561
+ cudnnSetCTCLossDescriptor_v8(cudnnCTCLossDescriptor_t ctcLossDesc,
562
+ cudnnDataType_t compType,
563
+ cudnnLossNormalizationMode_t normMode,
564
+ cudnnNanPropagation_t gradMode,
565
+ int maxLabelLength);
566
+
567
+ cudnnStatus_t CUDNNWINAPI
568
+ cudnnSetCTCLossDescriptor_v9(cudnnCTCLossDescriptor_t ctcLossDesc,
569
+ cudnnDataType_t compType,
570
+ cudnnLossNormalizationMode_t normMode,
571
+ cudnnCTCGradMode_t ctcGradMode,
572
+ int maxLabelLength);
573
+
574
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
575
+ cudnnGetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t *compType);
576
+
577
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
578
+ cudnnGetCTCLossDescriptorEx(cudnnCTCLossDescriptor_t ctcLossDesc,
579
+ cudnnDataType_t *compType,
580
+ cudnnLossNormalizationMode_t *normMode,
581
+ cudnnNanPropagation_t *gradMode);
582
+
583
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
584
+ cudnnGetCTCLossDescriptor_v8(cudnnCTCLossDescriptor_t ctcLossDesc,
585
+ cudnnDataType_t *compType,
586
+ cudnnLossNormalizationMode_t *normMode,
587
+ cudnnNanPropagation_t *gradMode,
588
+ int *maxLabelLength);
589
+
590
+ cudnnStatus_t CUDNNWINAPI
591
+ cudnnGetCTCLossDescriptor_v9(cudnnCTCLossDescriptor_t ctcLossDesc,
592
+ cudnnDataType_t *compType,
593
+ cudnnLossNormalizationMode_t *normMode,
594
+ cudnnCTCGradMode_t *ctcGradMode,
595
+ int *maxLabelLength);
596
+
597
+ cudnnStatus_t CUDNNWINAPI
598
+ cudnnDestroyCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc);
599
+
600
+ /* return the ctc costs and gradients, given the probabilities and labels */
601
+ cudnnStatus_t CUDNNWINAPI
602
+ cudnnCTCLoss(
603
+ cudnnHandle_t handle,
604
+ const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the
605
+ timing steps, N is the mini batch size, A is the alphabet size) */
606
+ const void *probs, /* probabilities after softmax, in GPU memory */
607
+ const int hostLabels[], /* labels, in CPU memory */
608
+ const int hostLabelLengths[], /* the length of each label, in CPU memory */
609
+ const int hostInputLengths[], /* the lengths of timing steps in each batch, in CPU memory */
610
+ void *costs, /* the returned costs of CTC, in GPU memory */
611
+ const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */
612
+ void *gradients, /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */
613
+ cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
614
+ cudnnCTCLossDescriptor_t ctcLossDesc,
615
+ void *workspace, /* pointer to the workspace, in GPU memory */
616
+ size_t workSpaceSizeInBytes); /* size of the workspace */
617
+
618
+ /* return the ctc costs and gradients, given the probabilities and labels */
619
+ cudnnStatus_t CUDNNWINAPI
620
+ cudnnCTCLoss_v8(
621
+ cudnnHandle_t handle,
622
+ cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
623
+ cudnnCTCLossDescriptor_t ctcLossDesc,
624
+ const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the
625
+ timing steps, N is the mini batch size, A is the alphabet size) */
626
+ const void *probs, /* probabilities after softmax, in GPU memory */
627
+ const int labels[], /* labels, in GPU memory */
628
+ const int labelLengths[], /* the length of each label, in GPU memory */
629
+ const int inputLengths[], /* the lengths of timing steps in each batch, in GPU memory */
630
+ void *costs, /* the returned costs of CTC, in GPU memory */
631
+ const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */
632
+ void *gradients, /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */
633
+ size_t workSpaceSizeInBytes, /* size of the workspace */
634
+ void *workspace); /* pointer to the workspace, in GPU memory */
635
+
636
+ /* return the workspace size needed for ctc */
637
+ cudnnStatus_t CUDNNWINAPI
638
+ cudnnGetCTCLossWorkspaceSize(
639
+ cudnnHandle_t handle,
640
+ const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the
641
+ timing steps, N is the mini batch size, A is the alphabet size) */
642
+ const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the
643
+ dimensions are T,N,A. To compute costs
644
+ only, set it to NULL */
645
+ const int *labels, /* labels, in CPU memory */
646
+ const int *labelLengths, /* the length of each label, in CPU memory */
647
+ const int *inputLengths, /* the lengths of timing steps in each batch, in CPU memory */
648
+ cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
649
+ cudnnCTCLossDescriptor_t ctcLossDesc,
650
+ size_t *sizeInBytes); /* pointer to the returned workspace size */
651
+
652
+ /* return the workspace size needed for ctc */
653
+ cudnnStatus_t CUDNNWINAPI
654
+ cudnnGetCTCLossWorkspaceSize_v8(
655
+ cudnnHandle_t handle,
656
+ cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
657
+ cudnnCTCLossDescriptor_t ctcLossDesc,
658
+ const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the
659
+ timing steps, N is the mini batch size, A is the alphabet size) */
660
+ const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the
661
+ dimensions are T,N,A. To compute costs
662
+ only, set it to NULL */
663
+ size_t *sizeInBytes); /* pointer to the returned workspace size */
664
+
665
+ #if defined(__cplusplus)
666
+ }
667
+ #endif
668
+
669
+ #endif /* CUDNN_ADV_H_ */
.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_backend.h ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2014-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef _CUDNN_BACKEND_H_
51
+ #define _CUDNN_BACKEND_H_
52
+
53
+ /*
54
+ * The content of this header has been moved into cudnn_graph.h.
55
+ * This header is kept for the backward compatibility purpose.
56
+ */
57
+
58
+ #include "cudnn_graph.h"
59
+
60
+ #endif /* _CUDNN_BACKEND_H_ */
.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_backend_v9.h ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2014-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef _CUDNN_BACKEND_H_
51
+ #define _CUDNN_BACKEND_H_
52
+
53
+ /*
54
+ * The content of this header has been moved into cudnn_graph.h.
55
+ * This header is kept for the backward compatibility purpose.
56
+ */
57
+
58
+ #include "cudnn_graph.h"
59
+
60
+ #endif /* _CUDNN_BACKEND_H_ */
.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_cnn.h ADDED
@@ -0,0 +1,693 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2014-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /*
51
+ * cudnn_cnn : cuDNN's basic definitions and CNN functions.
52
+ */
53
+
54
+ #if !defined(CUDNN_CNN_H_)
55
+ #define CUDNN_CNN_H_
56
+
57
+ #pragma once
58
+ #include <stdint.h>
59
+
60
+ #include "cudnn_version.h"
61
+ #include "cudnn_ops.h"
62
+
63
+ /* These version numbers are autogenerated, do not edit manually. */
64
+ #define CUDNN_CNN_MAJOR 9
65
+ #define CUDNN_CNN_MINOR 10
66
+ #define CUDNN_CNN_PATCH 2
67
+
68
+ #if (CUDNN_CNN_MAJOR != CUDNN_MAJOR) || (CUDNN_CNN_MINOR != CUDNN_MINOR) || (CUDNN_CNN_PATCH != CUDNN_PATCHLEVEL)
69
+ #error Version mismatch in cuDNN CNN INFER!!!
70
+ #endif
71
+
72
+ #if defined(__cplusplus)
73
+ extern "C" {
74
+ #endif
75
+
76
+ typedef struct cudnnConvolutionStruct *cudnnConvolutionDescriptor_t CUDNN_DEPRECATED;
77
+
78
+ typedef struct cudnnConvolutionFwdAlgoPerfStruct {
79
+ cudnnConvolutionFwdAlgo_t algo;
80
+ cudnnStatus_t status;
81
+ float time;
82
+ size_t memory;
83
+ cudnnDeterminism_t determinism;
84
+ cudnnMathType_t mathType;
85
+ int reserved[3];
86
+ } cudnnConvolutionFwdAlgoPerf_t CUDNN_DEPRECATED;
87
+
88
+ /* Create an instance of convolution descriptor */
89
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
90
+ cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t *convDesc);
91
+
92
+ /* Destroy an instance of convolution descriptor */
93
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
94
+ cudnnDestroyConvolutionDescriptor(cudnnConvolutionDescriptor_t convDesc);
95
+
96
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
97
+ cudnnSetConvolutionMathType(cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t mathType);
98
+
99
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
100
+ cudnnGetConvolutionMathType(cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t *mathType);
101
+
102
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
103
+ cudnnSetConvolutionGroupCount(cudnnConvolutionDescriptor_t convDesc, int groupCount);
104
+
105
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
106
+ cudnnGetConvolutionGroupCount(cudnnConvolutionDescriptor_t convDesc, int *groupCount);
107
+
108
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
109
+ cudnnSetConvolutionReorderType(cudnnConvolutionDescriptor_t convDesc, cudnnReorderType_t reorderType);
110
+
111
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
112
+ cudnnGetConvolutionReorderType(cudnnConvolutionDescriptor_t convDesc, cudnnReorderType_t *reorderType);
113
+
114
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
115
+ cudnnSetConvolution2dDescriptor(cudnnConvolutionDescriptor_t convDesc,
116
+ int pad_h, /* zero-padding height */
117
+ int pad_w, /* zero-padding width */
118
+ int u, /* vertical filter stride */
119
+ int v, /* horizontal filter stride */
120
+ int dilation_h, /* filter dilation in the vertical dimension */
121
+ int dilation_w, /* filter dilation in the horizontal dimension */
122
+ cudnnConvolutionMode_t mode,
123
+ cudnnDataType_t computeType);
124
+
125
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
126
+ cudnnGetConvolution2dDescriptor(const cudnnConvolutionDescriptor_t convDesc,
127
+ int *pad_h, /* zero-padding height */
128
+ int *pad_w, /* zero-padding width */
129
+ int *u, /* vertical filter stride */
130
+ int *v, /* horizontal filter stride */
131
+ int *dilation_h, /* filter dilation in the vertical dimension */
132
+ int *dilation_w, /* filter dilation in the horizontal dimension */
133
+ cudnnConvolutionMode_t *mode,
134
+ cudnnDataType_t *computeType);
135
+
136
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
137
+ cudnnSetConvolutionNdDescriptor(cudnnConvolutionDescriptor_t convDesc,
138
+ int arrayLength, /* nbDims-2 size */
139
+ const int padA[],
140
+ const int filterStrideA[],
141
+ const int dilationA[],
142
+ cudnnConvolutionMode_t mode,
143
+ cudnnDataType_t computeType); /* convolution data type */
144
+
145
+ /* Helper function to return the dimensions of the output tensor given a convolution descriptor */
146
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
147
+ cudnnGetConvolutionNdDescriptor(const cudnnConvolutionDescriptor_t convDesc,
148
+ int arrayLengthRequested,
149
+ int *arrayLength,
150
+ int padA[],
151
+ int strideA[],
152
+ int dilationA[],
153
+ cudnnConvolutionMode_t *mode,
154
+ cudnnDataType_t *computeType); /* convolution data type */
155
+
156
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
157
+ cudnnGetConvolution2dForwardOutputDim(const cudnnConvolutionDescriptor_t convDesc,
158
+ const cudnnTensorDescriptor_t inputTensorDesc,
159
+ const cudnnFilterDescriptor_t filterDesc,
160
+ int *n,
161
+ int *c,
162
+ int *h,
163
+ int *w);
164
+
165
+ /* Helper function to return the dimensions of the output tensor given a convolution descriptor */
166
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
167
+ cudnnGetConvolutionNdForwardOutputDim(const cudnnConvolutionDescriptor_t convDesc,
168
+ const cudnnTensorDescriptor_t inputTensorDesc,
169
+ const cudnnFilterDescriptor_t filterDesc,
170
+ int nbDims,
171
+ int tensorOuputDimA[]);
172
+
173
+ /* helper function to provide the convolution forward algo that fit best the requirement */
174
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
175
+ cudnnGetConvolutionForwardAlgorithmMaxCount(cudnnHandle_t handle, int *count);
176
+
177
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
178
+ cudnnGetConvolutionForwardAlgorithm_v7(cudnnHandle_t handle,
179
+ const cudnnTensorDescriptor_t srcDesc,
180
+ const cudnnFilterDescriptor_t filterDesc,
181
+ const cudnnConvolutionDescriptor_t convDesc,
182
+ const cudnnTensorDescriptor_t destDesc,
183
+ const int requestedAlgoCount,
184
+ int *returnedAlgoCount,
185
+ cudnnConvolutionFwdAlgoPerf_t *perfResults);
186
+
187
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
188
+ cudnnFindConvolutionForwardAlgorithm(cudnnHandle_t handle,
189
+ const cudnnTensorDescriptor_t xDesc,
190
+ const cudnnFilterDescriptor_t wDesc,
191
+ const cudnnConvolutionDescriptor_t convDesc,
192
+ const cudnnTensorDescriptor_t yDesc,
193
+ const int requestedAlgoCount,
194
+ int *returnedAlgoCount,
195
+ cudnnConvolutionFwdAlgoPerf_t *perfResults);
196
+
197
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
198
+ cudnnFindConvolutionForwardAlgorithmEx(cudnnHandle_t handle,
199
+ const cudnnTensorDescriptor_t xDesc,
200
+ const void *x,
201
+ const cudnnFilterDescriptor_t wDesc,
202
+ const void *w,
203
+ const cudnnConvolutionDescriptor_t convDesc,
204
+ const cudnnTensorDescriptor_t yDesc,
205
+ void *y,
206
+ const int requestedAlgoCount,
207
+ int *returnedAlgoCount,
208
+ cudnnConvolutionFwdAlgoPerf_t *perfResults,
209
+ void *workSpace,
210
+ size_t workSpaceSizeInBytes);
211
+
212
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
213
+ cudnnIm2Col(cudnnHandle_t handle,
214
+ const cudnnTensorDescriptor_t xDesc,
215
+ const void *x,
216
+ const cudnnFilterDescriptor_t wDesc,
217
+ const cudnnConvolutionDescriptor_t convDesc,
218
+ void *colBuffer);
219
+
220
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
221
+ cudnnReorderFilterAndBias(cudnnHandle_t handle,
222
+ const cudnnFilterDescriptor_t filterDesc,
223
+ cudnnReorderType_t reorderType,
224
+ const void *filterData,
225
+ void *reorderedFilterData,
226
+ int reorderBias,
227
+ const void *biasData,
228
+ void *reorderedBiasData);
229
+
230
+ /* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo*/
231
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
232
+ cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle_t handle,
233
+ const cudnnTensorDescriptor_t xDesc,
234
+ const cudnnFilterDescriptor_t wDesc,
235
+ const cudnnConvolutionDescriptor_t convDesc,
236
+ const cudnnTensorDescriptor_t yDesc,
237
+ cudnnConvolutionFwdAlgo_t algo,
238
+ size_t *sizeInBytes);
239
+
240
+ /* Convolution functions: All of the form "output = alpha * Op(inputs) + beta * output" */
241
+
242
+ /* Function to perform the forward pass for batch convolution */
243
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
244
+ cudnnConvolutionForward(cudnnHandle_t handle,
245
+ const void *alpha,
246
+ const cudnnTensorDescriptor_t xDesc,
247
+ const void *x,
248
+ const cudnnFilterDescriptor_t wDesc,
249
+ const void *w,
250
+ const cudnnConvolutionDescriptor_t convDesc,
251
+ cudnnConvolutionFwdAlgo_t algo,
252
+ void *workSpace,
253
+ size_t workSpaceSizeInBytes,
254
+ const void *beta,
255
+ const cudnnTensorDescriptor_t yDesc,
256
+ void *y);
257
+
258
+ /* Fused conv/bias/activation operation : y = Act( alpha1 * conv(x) + alpha2 * z + bias ) */
259
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
260
+ cudnnConvolutionBiasActivationForward(cudnnHandle_t handle,
261
+ const void *alpha1,
262
+ const cudnnTensorDescriptor_t xDesc,
263
+ const void *x,
264
+ const cudnnFilterDescriptor_t wDesc,
265
+ const void *w,
266
+ const cudnnConvolutionDescriptor_t convDesc,
267
+ cudnnConvolutionFwdAlgo_t algo,
268
+ void *workSpace,
269
+ size_t workSpaceSizeInBytes,
270
+ const void *alpha2,
271
+ const cudnnTensorDescriptor_t zDesc,
272
+ const void *z,
273
+ const cudnnTensorDescriptor_t biasDesc,
274
+ const void *bias,
275
+ const cudnnActivationDescriptor_t activationDesc,
276
+ const cudnnTensorDescriptor_t yDesc,
277
+ void *y);
278
+
279
+ /* helper function to provide the convolution backward data algo that fit best the requirement */
280
+
281
+ typedef struct cudnnConvolutionBwdDataAlgoPerfStruct {
282
+ cudnnConvolutionBwdDataAlgo_t algo;
283
+ cudnnStatus_t status;
284
+ float time;
285
+ size_t memory;
286
+ cudnnDeterminism_t determinism;
287
+ cudnnMathType_t mathType;
288
+ int reserved[3];
289
+ } cudnnConvolutionBwdDataAlgoPerf_t CUDNN_DEPRECATED;
290
+
291
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
292
+ cudnnGetConvolutionBackwardDataAlgorithmMaxCount(cudnnHandle_t handle, int *count);
293
+
294
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
295
+ cudnnFindConvolutionBackwardDataAlgorithm(cudnnHandle_t handle,
296
+ const cudnnFilterDescriptor_t wDesc,
297
+ const cudnnTensorDescriptor_t dyDesc,
298
+ const cudnnConvolutionDescriptor_t convDesc,
299
+ const cudnnTensorDescriptor_t dxDesc,
300
+ const int requestedAlgoCount,
301
+ int *returnedAlgoCount,
302
+ cudnnConvolutionBwdDataAlgoPerf_t *perfResults);
303
+
304
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
305
+ cudnnFindConvolutionBackwardDataAlgorithmEx(cudnnHandle_t handle,
306
+ const cudnnFilterDescriptor_t wDesc,
307
+ const void *w,
308
+ const cudnnTensorDescriptor_t dyDesc,
309
+ const void *dy,
310
+ const cudnnConvolutionDescriptor_t convDesc,
311
+ const cudnnTensorDescriptor_t dxDesc,
312
+ void *dx,
313
+ const int requestedAlgoCount,
314
+ int *returnedAlgoCount,
315
+ cudnnConvolutionBwdDataAlgoPerf_t *perfResults,
316
+ void *workSpace,
317
+ size_t workSpaceSizeInBytes);
318
+
319
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
320
+ cudnnGetConvolutionBackwardDataAlgorithm_v7(cudnnHandle_t handle,
321
+ const cudnnFilterDescriptor_t filterDesc,
322
+ const cudnnTensorDescriptor_t diffDesc,
323
+ const cudnnConvolutionDescriptor_t convDesc,
324
+ const cudnnTensorDescriptor_t gradDesc,
325
+ const int requestedAlgoCount,
326
+ int *returnedAlgoCount,
327
+ cudnnConvolutionBwdDataAlgoPerf_t *perfResults);
328
+
329
+ /*
330
+ * convolution algorithm (which requires potentially some workspace)
331
+ */
332
+
333
+ /* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo*/
334
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
335
+ cudnnGetConvolutionBackwardDataWorkspaceSize(cudnnHandle_t handle,
336
+ const cudnnFilterDescriptor_t wDesc,
337
+ const cudnnTensorDescriptor_t dyDesc,
338
+ const cudnnConvolutionDescriptor_t convDesc,
339
+ const cudnnTensorDescriptor_t dxDesc,
340
+ cudnnConvolutionBwdDataAlgo_t algo,
341
+ size_t *sizeInBytes);
342
+
343
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
344
+ cudnnConvolutionBackwardData(cudnnHandle_t handle,
345
+ const void *alpha,
346
+ const cudnnFilterDescriptor_t wDesc,
347
+ const void *w,
348
+ const cudnnTensorDescriptor_t dyDesc,
349
+ const void *dy,
350
+ const cudnnConvolutionDescriptor_t convDesc,
351
+ cudnnConvolutionBwdDataAlgo_t algo,
352
+ void *workSpace,
353
+ size_t workSpaceSizeInBytes,
354
+ const void *beta,
355
+ const cudnnTensorDescriptor_t dxDesc,
356
+ void *dx);
357
+
358
+ /* Helper function to calculate folding descriptors for dgrad */
359
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
360
+ cudnnGetFoldedConvBackwardDataDescriptors(const cudnnHandle_t handle,
361
+ const cudnnFilterDescriptor_t filterDesc,
362
+ const cudnnTensorDescriptor_t diffDesc,
363
+ const cudnnConvolutionDescriptor_t convDesc,
364
+ const cudnnTensorDescriptor_t gradDesc,
365
+ const cudnnTensorFormat_t transformFormat,
366
+ cudnnFilterDescriptor_t foldedFilterDesc,
367
+ cudnnTensorDescriptor_t paddedDiffDesc,
368
+ cudnnConvolutionDescriptor_t foldedConvDesc,
369
+ cudnnTensorDescriptor_t foldedGradDesc,
370
+ cudnnTensorTransformDescriptor_t filterFoldTransDesc,
371
+ cudnnTensorTransformDescriptor_t diffPadTransDesc,
372
+ cudnnTensorTransformDescriptor_t gradFoldTransDesc,
373
+ cudnnTensorTransformDescriptor_t gradUnfoldTransDesc);
374
+
375
+ /* cudnnFusedOps... */
376
+ struct cudnnFusedOpsConstParamStruct;
377
+ typedef struct cudnnFusedOpsConstParamStruct *cudnnFusedOpsConstParamPack_t CUDNN_DEPRECATED;
378
+
379
+ struct cudnnFusedOpsVariantParamStruct;
380
+ typedef struct cudnnFusedOpsVariantParamStruct *cudnnFusedOpsVariantParamPack_t CUDNN_DEPRECATED;
381
+
382
+ struct cudnnFusedOpsPlanStruct;
383
+ typedef struct cudnnFusedOpsPlanStruct *cudnnFusedOpsPlan_t CUDNN_DEPRECATED;
384
+
385
+ typedef enum {
386
+ /* each op in [ ] can be disabled by passing NULL ptr */
387
+ /* [per channel scale], [per channel bias], [activation], convolution, [generate BN stats] */
388
+ CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS = 0,
389
+ /* [per channel scale], [per channel bias], [activation], convolutionBackwardWeights */
390
+ CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD = 1,
391
+ /* utility for BN training in BN-conv fusion */
392
+ /* computes the equivalent scale and bias from ySum ySqSum and learned scale, bias */
393
+ /* optionally update running stats and generate saved stats */
394
+ CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING = 2,
395
+ /* utility for BN inference in BN-conv fusion */
396
+ /* computes the equivalent scale and bias from learned running stats and learned scale, bias */
397
+ CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE = 3,
398
+ /* reserved for future use: convolution, [per channel scale], [per channel bias], [residual add], [activation] */
399
+ CUDNN_FUSED_CONV_SCALE_BIAS_ADD_ACTIVATION = 4,
400
+ /* reserved for future use: [per channel scale], [per channel bias], [residual add], activation, bitmask */
401
+ CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK = 5,
402
+ /* reserved for future use */
403
+ CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM = 6,
404
+ } cudnnFusedOps_t CUDNN_DEPRECATED;
405
+
406
+ typedef enum {
407
+ /* set XDESC: pass previously initialized cudnnTensorDescriptor_t */
408
+ /* get XDESC: pass previously created cudnnTensorDescriptor_t */
409
+ CUDNN_PARAM_XDESC = 0,
410
+ /* set/get XDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
411
+ CUDNN_PARAM_XDATA_PLACEHOLDER = 1,
412
+ /* set/get BN_MODE: pass cudnnBatchNormMode_t* */
413
+ CUDNN_PARAM_BN_MODE = 2,
414
+ /* set CUDNN_PARAM_BN_EQSCALEBIAS_DESC: pass previously initialized cudnnTensorDescriptor_t */
415
+ /* get CUDNN_PARAM_BN_EQSCALEBIAS_DESC: pass previously created cudnnTensorDescriptor_t */
416
+ CUDNN_PARAM_BN_EQSCALEBIAS_DESC = 3,
417
+ /* set/get BN_EQSCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
418
+ CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER = 4,
419
+ /* set/get BN_EQBIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
420
+ CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER = 5,
421
+ /* set ACTIVATION_DESC: pass previously initialized cudnnActivationDescriptor_t */
422
+ /* get ACTIVATION_DESC: pass previously created cudnnActivationDescriptor_t */
423
+ CUDNN_PARAM_ACTIVATION_DESC = 6,
424
+ /* set CONV_DESC: pass previously initialized cudnnConvolutionDescriptor_t */
425
+ /* get CONV_DESC: pass previously created cudnnConvolutionDescriptor_t */
426
+ CUDNN_PARAM_CONV_DESC = 7,
427
+ /* set WDESC: pass previously initialized cudnnFilterDescriptor_t */
428
+ /* get WDESC: pass previously created cudnnFilterDescriptor_t */
429
+ CUDNN_PARAM_WDESC = 8,
430
+ /* set/get WDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
431
+ CUDNN_PARAM_WDATA_PLACEHOLDER = 9,
432
+ /* set DWDESC: pass previously initialized cudnnFilterDescriptor_t */
433
+ /* get DWDESC: pass previously created cudnnFilterDescriptor_t */
434
+ CUDNN_PARAM_DWDESC = 10,
435
+ /* set/get DWDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
436
+ CUDNN_PARAM_DWDATA_PLACEHOLDER = 11,
437
+ /* set YDESC: pass previously initialized cudnnTensorDescriptor_t */
438
+ /* get YDESC: pass previously created cudnnTensorDescriptor_t */
439
+ CUDNN_PARAM_YDESC = 12,
440
+ /* set/get YDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
441
+ CUDNN_PARAM_YDATA_PLACEHOLDER = 13,
442
+ /* set DYDESC: pass previously initialized cudnnTensorDescriptor_t */
443
+ /* get DYDESC: pass previously created cudnnTensorDescriptor_t */
444
+ CUDNN_PARAM_DYDESC = 14,
445
+ /* set/get DYDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
446
+ CUDNN_PARAM_DYDATA_PLACEHOLDER = 15,
447
+ /* set YSTATS_DESC: pass previously initialized cudnnTensorDescriptor_t */
448
+ /* get YSTATS_DESC: pass previously created cudnnTensorDescriptor_t */
449
+ CUDNN_PARAM_YSTATS_DESC = 16,
450
+ /* set/get YSUM_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
451
+ CUDNN_PARAM_YSUM_PLACEHOLDER = 17,
452
+ /* set/get YSQSUM_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
453
+ CUDNN_PARAM_YSQSUM_PLACEHOLDER = 18,
454
+ /* set CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC: pass previously initialized cudnnTensorDescriptor_t */
455
+ /* get CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC: pass previously created cudnnTensorDescriptor_t */
456
+ CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC = 19,
457
+ /* set/get CUDNN_PARAM_BN_SCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
458
+ CUDNN_PARAM_BN_SCALE_PLACEHOLDER = 20,
459
+ /* set/get CUDNN_PARAM_BN_BIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
460
+ CUDNN_PARAM_BN_BIAS_PLACEHOLDER = 21,
461
+ /* set/get CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
462
+ CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER = 22,
463
+ /* set/get CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
464
+ CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER = 23,
465
+ /* set/get CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
466
+ CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER = 24,
467
+ /* set/get CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
468
+ CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER = 25,
469
+
470
+ /* set ZDESC: pass previously initialized cudnnTensorDescriptor_t */
471
+ /* get ZDESC: pass previously created cudnnTensorDescriptor_t */
472
+ CUDNN_PARAM_ZDESC = 26,
473
+ /* set/get ZDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
474
+ CUDNN_PARAM_ZDATA_PLACEHOLDER = 27,
475
+ /* set BN_Z_EQSCALEBIAS_DESC: pass previously initialized cudnnTensorDescriptor_t */
476
+ /* get BN_Z_EQSCALEBIAS_DESC: pass previously created cudnnTensorDescriptor_t */
477
+ CUDNN_PARAM_BN_Z_EQSCALEBIAS_DESC = 28,
478
+ /* set/get BN_Z_EQSCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
479
+ CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER = 29,
480
+ /* set/get BN_Z_EQBIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
481
+ CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER = 30,
482
+
483
+ /* set ACTIVATION_BITMASK_DESC: pass previously initialized cudnnTensorDescriptor_t */
484
+ /* get ACTIVATION_BITMASK_DESC: pass previously created cudnnTensorDescriptor_t */
485
+ CUDNN_PARAM_ACTIVATION_BITMASK_DESC = 31,
486
+ /* set/get ACTIVATION_BITMASK_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
487
+ CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER = 32,
488
+
489
+ /* set DXDESC: pass previously initialized cudnnTensorDescriptor_t */
490
+ /* get DXDESC: pass previously created cudnnTensorDescriptor_t */
491
+ CUDNN_PARAM_DXDESC = 33,
492
+ /* set/get DXDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
493
+ CUDNN_PARAM_DXDATA_PLACEHOLDER = 34,
494
+ /* set DZDESC: pass previously initialized cudnnTensorDescriptor_t */
495
+ /* get DZDESC: pass previously created cudnnTensorDescriptor_t */
496
+ CUDNN_PARAM_DZDESC = 35,
497
+ /* set/get DZDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
498
+ CUDNN_PARAM_DZDATA_PLACEHOLDER = 36,
499
+ /* set/get CUDNN_PARAM_BN_DSCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
500
+ CUDNN_PARAM_BN_DSCALE_PLACEHOLDER = 37,
501
+ /* set/get CUDNN_PARAM_BN_DBIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
502
+ CUDNN_PARAM_BN_DBIAS_PLACEHOLDER = 38,
503
+ } cudnnFusedOpsConstParamLabel_t CUDNN_DEPRECATED;
504
+
505
+ typedef enum {
506
+ CUDNN_PTR_NULL = 0,
507
+ CUDNN_PTR_ELEM_ALIGNED = 1,
508
+ CUDNN_PTR_16B_ALIGNED = 2,
509
+ } cudnnFusedOpsPointerPlaceHolder_t CUDNN_DEPRECATED;
510
+
511
+ typedef enum {
512
+ /* set: pass void* pointing to dev memory */
513
+ /* get: pass void** pointing to host memory */
514
+ CUDNN_PTR_XDATA = 0,
515
+ CUDNN_PTR_BN_EQSCALE = 1,
516
+ CUDNN_PTR_BN_EQBIAS = 2,
517
+ CUDNN_PTR_WDATA = 3,
518
+ CUDNN_PTR_DWDATA = 4,
519
+ CUDNN_PTR_YDATA = 5,
520
+ CUDNN_PTR_DYDATA = 6,
521
+ CUDNN_PTR_YSUM = 7,
522
+ CUDNN_PTR_YSQSUM = 8,
523
+ CUDNN_PTR_WORKSPACE = 9,
524
+ CUDNN_PTR_BN_SCALE = 10,
525
+ CUDNN_PTR_BN_BIAS = 11,
526
+ CUDNN_PTR_BN_SAVED_MEAN = 12,
527
+ CUDNN_PTR_BN_SAVED_INVSTD = 13,
528
+ CUDNN_PTR_BN_RUNNING_MEAN = 14,
529
+ CUDNN_PTR_BN_RUNNING_VAR = 15,
530
+ CUDNN_PTR_ZDATA = 16,
531
+ CUDNN_PTR_BN_Z_EQSCALE = 17,
532
+ CUDNN_PTR_BN_Z_EQBIAS = 18,
533
+ CUDNN_PTR_ACTIVATION_BITMASK = 19,
534
+ CUDNN_PTR_DXDATA = 20,
535
+ CUDNN_PTR_DZDATA = 21,
536
+ CUDNN_PTR_BN_DSCALE = 22,
537
+ CUDNN_PTR_BN_DBIAS = 23,
538
+
539
+ /* set/get: pass size_t* pointing to host memory */
540
+ CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES = 100,
541
+ /* set/get: pass int64_t* pointing to host memory */
542
+ CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT = 101,
543
+ /* set/get: pass double* pointing to host memory */
544
+ CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR = 102,
545
+ /* set/get: pass double* pointing to host memory */
546
+ CUDNN_SCALAR_DOUBLE_BN_EPSILON = 103,
547
+ } cudnnFusedOpsVariantParamLabel_t CUDNN_DEPRECATED;
548
+
549
+ cudnnStatus_t CUDNNWINAPI
550
+ cudnnCnnVersionCheck(void);
551
+
552
+ /* helper function to provide the convolution backward filter algo that fit best the requirement */
553
+
554
+ typedef struct cudnnConvolutionBwdFilterAlgoPerfStruct {
555
+ cudnnConvolutionBwdFilterAlgo_t algo;
556
+ cudnnStatus_t status;
557
+ float time;
558
+ size_t memory;
559
+ cudnnDeterminism_t determinism;
560
+ cudnnMathType_t mathType;
561
+ int reserved[3];
562
+ } cudnnConvolutionBwdFilterAlgoPerf_t CUDNN_DEPRECATED;
563
+
564
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
565
+ cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(cudnnHandle_t handle, int *count);
566
+
567
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
568
+ cudnnFindConvolutionBackwardFilterAlgorithm(cudnnHandle_t handle,
569
+ const cudnnTensorDescriptor_t xDesc,
570
+ const cudnnTensorDescriptor_t dyDesc,
571
+ const cudnnConvolutionDescriptor_t convDesc,
572
+ const cudnnFilterDescriptor_t dwDesc,
573
+ const int requestedAlgoCount,
574
+ int *returnedAlgoCount,
575
+ cudnnConvolutionBwdFilterAlgoPerf_t *perfResults);
576
+
577
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
578
+ cudnnFindConvolutionBackwardFilterAlgorithmEx(cudnnHandle_t handle,
579
+ const cudnnTensorDescriptor_t xDesc,
580
+ const void *x,
581
+ const cudnnTensorDescriptor_t dyDesc,
582
+ const void *y,
583
+ const cudnnConvolutionDescriptor_t convDesc,
584
+ const cudnnFilterDescriptor_t dwDesc,
585
+ void *dw,
586
+ const int requestedAlgoCount,
587
+ int *returnedAlgoCount,
588
+ cudnnConvolutionBwdFilterAlgoPerf_t *perfResults,
589
+ void *workSpace,
590
+ size_t workSpaceSizeInBytes);
591
+
592
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
593
+ cudnnGetConvolutionBackwardFilterAlgorithm_v7(cudnnHandle_t handle,
594
+ const cudnnTensorDescriptor_t srcDesc,
595
+ const cudnnTensorDescriptor_t diffDesc,
596
+ const cudnnConvolutionDescriptor_t convDesc,
597
+ const cudnnFilterDescriptor_t gradDesc,
598
+ const int requestedAlgoCount,
599
+ int *returnedAlgoCount,
600
+ cudnnConvolutionBwdFilterAlgoPerf_t *perfResults);
601
+
602
+ /*
603
+ * convolution algorithm (which requires potentially some workspace)
604
+ */
605
+
606
+ /* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo*/
607
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
608
+ cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnnHandle_t handle,
609
+ const cudnnTensorDescriptor_t xDesc,
610
+ const cudnnTensorDescriptor_t dyDesc,
611
+ const cudnnConvolutionDescriptor_t convDesc,
612
+ const cudnnFilterDescriptor_t gradDesc,
613
+ cudnnConvolutionBwdFilterAlgo_t algo,
614
+ size_t *sizeInBytes);
615
+
616
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
617
+ cudnnConvolutionBackwardFilter(cudnnHandle_t handle,
618
+ const void *alpha,
619
+ const cudnnTensorDescriptor_t xDesc,
620
+ const void *x,
621
+ const cudnnTensorDescriptor_t dyDesc,
622
+ const void *dy,
623
+ const cudnnConvolutionDescriptor_t convDesc,
624
+ cudnnConvolutionBwdFilterAlgo_t algo,
625
+ void *workSpace,
626
+ size_t workSpaceSizeInBytes,
627
+ const void *beta,
628
+ const cudnnFilterDescriptor_t dwDesc,
629
+ void *dw);
630
+
631
+ /* Function to compute the bias gradient for batch convolution */
632
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
633
+ cudnnConvolutionBackwardBias(cudnnHandle_t handle,
634
+ const void *alpha,
635
+ const cudnnTensorDescriptor_t dyDesc,
636
+ const void *dy,
637
+ const void *beta,
638
+ const cudnnTensorDescriptor_t dbDesc,
639
+ void *db);
640
+
641
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
642
+ cudnnCreateFusedOpsConstParamPack(cudnnFusedOpsConstParamPack_t *constPack, cudnnFusedOps_t ops);
643
+
644
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
645
+ cudnnDestroyFusedOpsConstParamPack(cudnnFusedOpsConstParamPack_t constPack);
646
+
647
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
648
+ cudnnSetFusedOpsConstParamPackAttribute(cudnnFusedOpsConstParamPack_t constPack,
649
+ cudnnFusedOpsConstParamLabel_t paramLabel,
650
+ const void *param);
651
+
652
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
653
+ cudnnGetFusedOpsConstParamPackAttribute(const cudnnFusedOpsConstParamPack_t constPack,
654
+ cudnnFusedOpsConstParamLabel_t paramLabel,
655
+ void *param,
656
+ int *isNULL);
657
+
658
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
659
+ cudnnCreateFusedOpsVariantParamPack(cudnnFusedOpsVariantParamPack_t *varPack, cudnnFusedOps_t ops);
660
+
661
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
662
+ cudnnDestroyFusedOpsVariantParamPack(cudnnFusedOpsVariantParamPack_t varPack);
663
+
664
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
665
+ cudnnSetFusedOpsVariantParamPackAttribute(cudnnFusedOpsVariantParamPack_t varPack,
666
+ cudnnFusedOpsVariantParamLabel_t paramLabel,
667
+ void *ptr);
668
+
669
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
670
+ cudnnGetFusedOpsVariantParamPackAttribute(const cudnnFusedOpsVariantParamPack_t varPack,
671
+ cudnnFusedOpsVariantParamLabel_t paramLabel,
672
+ void *ptr);
673
+
674
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
675
+ cudnnCreateFusedOpsPlan(cudnnFusedOpsPlan_t *plan, cudnnFusedOps_t ops);
676
+
677
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
678
+ cudnnDestroyFusedOpsPlan(cudnnFusedOpsPlan_t plan);
679
+
680
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
681
+ cudnnMakeFusedOpsPlan(cudnnHandle_t handle,
682
+ cudnnFusedOpsPlan_t plan,
683
+ const cudnnFusedOpsConstParamPack_t constPack,
684
+ size_t *workspaceSizeInBytes);
685
+
686
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
687
+ cudnnFusedOpsExecute(cudnnHandle_t handle, const cudnnFusedOpsPlan_t plan, cudnnFusedOpsVariantParamPack_t varPack);
688
+
689
+ #if defined(__cplusplus)
690
+ }
691
+ #endif
692
+
693
+ #endif /* CUDNN_CNN_H_ */
.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_cnn_v9.h ADDED
@@ -0,0 +1,693 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2014-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /*
51
+ * cudnn_cnn : cuDNN's basic definitions and CNN functions.
52
+ */
53
+
54
+ #if !defined(CUDNN_CNN_H_)
55
+ #define CUDNN_CNN_H_
56
+
57
+ #pragma once
58
+ #include <stdint.h>
59
+
60
+ #include "cudnn_version.h"
61
+ #include "cudnn_ops.h"
62
+
63
+ /* These version numbers are autogenerated, do not edit manually. */
64
+ #define CUDNN_CNN_MAJOR 9
65
+ #define CUDNN_CNN_MINOR 10
66
+ #define CUDNN_CNN_PATCH 2
67
+
68
+ #if (CUDNN_CNN_MAJOR != CUDNN_MAJOR) || (CUDNN_CNN_MINOR != CUDNN_MINOR) || (CUDNN_CNN_PATCH != CUDNN_PATCHLEVEL)
69
+ #error Version mismatch in cuDNN CNN INFER!!!
70
+ #endif
71
+
72
+ #if defined(__cplusplus)
73
+ extern "C" {
74
+ #endif
75
+
76
+ typedef struct cudnnConvolutionStruct *cudnnConvolutionDescriptor_t CUDNN_DEPRECATED;
77
+
78
+ typedef struct cudnnConvolutionFwdAlgoPerfStruct {
79
+ cudnnConvolutionFwdAlgo_t algo;
80
+ cudnnStatus_t status;
81
+ float time;
82
+ size_t memory;
83
+ cudnnDeterminism_t determinism;
84
+ cudnnMathType_t mathType;
85
+ int reserved[3];
86
+ } cudnnConvolutionFwdAlgoPerf_t CUDNN_DEPRECATED;
87
+
88
+ /* Create an instance of convolution descriptor */
89
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
90
+ cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t *convDesc);
91
+
92
+ /* Destroy an instance of convolution descriptor */
93
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
94
+ cudnnDestroyConvolutionDescriptor(cudnnConvolutionDescriptor_t convDesc);
95
+
96
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
97
+ cudnnSetConvolutionMathType(cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t mathType);
98
+
99
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
100
+ cudnnGetConvolutionMathType(cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t *mathType);
101
+
102
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
103
+ cudnnSetConvolutionGroupCount(cudnnConvolutionDescriptor_t convDesc, int groupCount);
104
+
105
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
106
+ cudnnGetConvolutionGroupCount(cudnnConvolutionDescriptor_t convDesc, int *groupCount);
107
+
108
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
109
+ cudnnSetConvolutionReorderType(cudnnConvolutionDescriptor_t convDesc, cudnnReorderType_t reorderType);
110
+
111
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
112
+ cudnnGetConvolutionReorderType(cudnnConvolutionDescriptor_t convDesc, cudnnReorderType_t *reorderType);
113
+
114
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
115
+ cudnnSetConvolution2dDescriptor(cudnnConvolutionDescriptor_t convDesc,
116
+ int pad_h, /* zero-padding height */
117
+ int pad_w, /* zero-padding width */
118
+ int u, /* vertical filter stride */
119
+ int v, /* horizontal filter stride */
120
+ int dilation_h, /* filter dilation in the vertical dimension */
121
+ int dilation_w, /* filter dilation in the horizontal dimension */
122
+ cudnnConvolutionMode_t mode,
123
+ cudnnDataType_t computeType);
124
+
125
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
126
+ cudnnGetConvolution2dDescriptor(const cudnnConvolutionDescriptor_t convDesc,
127
+ int *pad_h, /* zero-padding height */
128
+ int *pad_w, /* zero-padding width */
129
+ int *u, /* vertical filter stride */
130
+ int *v, /* horizontal filter stride */
131
+ int *dilation_h, /* filter dilation in the vertical dimension */
132
+ int *dilation_w, /* filter dilation in the horizontal dimension */
133
+ cudnnConvolutionMode_t *mode,
134
+ cudnnDataType_t *computeType);
135
+
136
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
137
+ cudnnSetConvolutionNdDescriptor(cudnnConvolutionDescriptor_t convDesc,
138
+ int arrayLength, /* nbDims-2 size */
139
+ const int padA[],
140
+ const int filterStrideA[],
141
+ const int dilationA[],
142
+ cudnnConvolutionMode_t mode,
143
+ cudnnDataType_t computeType); /* convolution data type */
144
+
145
+ /* Helper function to return the dimensions of the output tensor given a convolution descriptor */
146
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
147
+ cudnnGetConvolutionNdDescriptor(const cudnnConvolutionDescriptor_t convDesc,
148
+ int arrayLengthRequested,
149
+ int *arrayLength,
150
+ int padA[],
151
+ int strideA[],
152
+ int dilationA[],
153
+ cudnnConvolutionMode_t *mode,
154
+ cudnnDataType_t *computeType); /* convolution data type */
155
+
156
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
157
+ cudnnGetConvolution2dForwardOutputDim(const cudnnConvolutionDescriptor_t convDesc,
158
+ const cudnnTensorDescriptor_t inputTensorDesc,
159
+ const cudnnFilterDescriptor_t filterDesc,
160
+ int *n,
161
+ int *c,
162
+ int *h,
163
+ int *w);
164
+
165
+ /* Helper function to return the dimensions of the output tensor given a convolution descriptor */
166
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
167
+ cudnnGetConvolutionNdForwardOutputDim(const cudnnConvolutionDescriptor_t convDesc,
168
+ const cudnnTensorDescriptor_t inputTensorDesc,
169
+ const cudnnFilterDescriptor_t filterDesc,
170
+ int nbDims,
171
+ int tensorOuputDimA[]);
172
+
173
+ /* helper function to provide the convolution forward algo that fit best the requirement */
174
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
175
+ cudnnGetConvolutionForwardAlgorithmMaxCount(cudnnHandle_t handle, int *count);
176
+
177
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
178
+ cudnnGetConvolutionForwardAlgorithm_v7(cudnnHandle_t handle,
179
+ const cudnnTensorDescriptor_t srcDesc,
180
+ const cudnnFilterDescriptor_t filterDesc,
181
+ const cudnnConvolutionDescriptor_t convDesc,
182
+ const cudnnTensorDescriptor_t destDesc,
183
+ const int requestedAlgoCount,
184
+ int *returnedAlgoCount,
185
+ cudnnConvolutionFwdAlgoPerf_t *perfResults);
186
+
187
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
188
+ cudnnFindConvolutionForwardAlgorithm(cudnnHandle_t handle,
189
+ const cudnnTensorDescriptor_t xDesc,
190
+ const cudnnFilterDescriptor_t wDesc,
191
+ const cudnnConvolutionDescriptor_t convDesc,
192
+ const cudnnTensorDescriptor_t yDesc,
193
+ const int requestedAlgoCount,
194
+ int *returnedAlgoCount,
195
+ cudnnConvolutionFwdAlgoPerf_t *perfResults);
196
+
197
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
198
+ cudnnFindConvolutionForwardAlgorithmEx(cudnnHandle_t handle,
199
+ const cudnnTensorDescriptor_t xDesc,
200
+ const void *x,
201
+ const cudnnFilterDescriptor_t wDesc,
202
+ const void *w,
203
+ const cudnnConvolutionDescriptor_t convDesc,
204
+ const cudnnTensorDescriptor_t yDesc,
205
+ void *y,
206
+ const int requestedAlgoCount,
207
+ int *returnedAlgoCount,
208
+ cudnnConvolutionFwdAlgoPerf_t *perfResults,
209
+ void *workSpace,
210
+ size_t workSpaceSizeInBytes);
211
+
212
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
213
+ cudnnIm2Col(cudnnHandle_t handle,
214
+ const cudnnTensorDescriptor_t xDesc,
215
+ const void *x,
216
+ const cudnnFilterDescriptor_t wDesc,
217
+ const cudnnConvolutionDescriptor_t convDesc,
218
+ void *colBuffer);
219
+
220
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
221
+ cudnnReorderFilterAndBias(cudnnHandle_t handle,
222
+ const cudnnFilterDescriptor_t filterDesc,
223
+ cudnnReorderType_t reorderType,
224
+ const void *filterData,
225
+ void *reorderedFilterData,
226
+ int reorderBias,
227
+ const void *biasData,
228
+ void *reorderedBiasData);
229
+
230
+ /* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo*/
231
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
232
+ cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle_t handle,
233
+ const cudnnTensorDescriptor_t xDesc,
234
+ const cudnnFilterDescriptor_t wDesc,
235
+ const cudnnConvolutionDescriptor_t convDesc,
236
+ const cudnnTensorDescriptor_t yDesc,
237
+ cudnnConvolutionFwdAlgo_t algo,
238
+ size_t *sizeInBytes);
239
+
240
+ /* Convolution functions: All of the form "output = alpha * Op(inputs) + beta * output" */
241
+
242
+ /* Function to perform the forward pass for batch convolution */
243
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
244
+ cudnnConvolutionForward(cudnnHandle_t handle,
245
+ const void *alpha,
246
+ const cudnnTensorDescriptor_t xDesc,
247
+ const void *x,
248
+ const cudnnFilterDescriptor_t wDesc,
249
+ const void *w,
250
+ const cudnnConvolutionDescriptor_t convDesc,
251
+ cudnnConvolutionFwdAlgo_t algo,
252
+ void *workSpace,
253
+ size_t workSpaceSizeInBytes,
254
+ const void *beta,
255
+ const cudnnTensorDescriptor_t yDesc,
256
+ void *y);
257
+
258
+ /* Fused conv/bias/activation operation : y = Act( alpha1 * conv(x) + alpha2 * z + bias ) */
259
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
260
+ cudnnConvolutionBiasActivationForward(cudnnHandle_t handle,
261
+ const void *alpha1,
262
+ const cudnnTensorDescriptor_t xDesc,
263
+ const void *x,
264
+ const cudnnFilterDescriptor_t wDesc,
265
+ const void *w,
266
+ const cudnnConvolutionDescriptor_t convDesc,
267
+ cudnnConvolutionFwdAlgo_t algo,
268
+ void *workSpace,
269
+ size_t workSpaceSizeInBytes,
270
+ const void *alpha2,
271
+ const cudnnTensorDescriptor_t zDesc,
272
+ const void *z,
273
+ const cudnnTensorDescriptor_t biasDesc,
274
+ const void *bias,
275
+ const cudnnActivationDescriptor_t activationDesc,
276
+ const cudnnTensorDescriptor_t yDesc,
277
+ void *y);
278
+
279
+ /* helper function to provide the convolution backward data algo that fit best the requirement */
280
+
281
+ typedef struct cudnnConvolutionBwdDataAlgoPerfStruct {
282
+ cudnnConvolutionBwdDataAlgo_t algo;
283
+ cudnnStatus_t status;
284
+ float time;
285
+ size_t memory;
286
+ cudnnDeterminism_t determinism;
287
+ cudnnMathType_t mathType;
288
+ int reserved[3];
289
+ } cudnnConvolutionBwdDataAlgoPerf_t CUDNN_DEPRECATED;
290
+
291
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
292
+ cudnnGetConvolutionBackwardDataAlgorithmMaxCount(cudnnHandle_t handle, int *count);
293
+
294
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
295
+ cudnnFindConvolutionBackwardDataAlgorithm(cudnnHandle_t handle,
296
+ const cudnnFilterDescriptor_t wDesc,
297
+ const cudnnTensorDescriptor_t dyDesc,
298
+ const cudnnConvolutionDescriptor_t convDesc,
299
+ const cudnnTensorDescriptor_t dxDesc,
300
+ const int requestedAlgoCount,
301
+ int *returnedAlgoCount,
302
+ cudnnConvolutionBwdDataAlgoPerf_t *perfResults);
303
+
304
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
305
+ cudnnFindConvolutionBackwardDataAlgorithmEx(cudnnHandle_t handle,
306
+ const cudnnFilterDescriptor_t wDesc,
307
+ const void *w,
308
+ const cudnnTensorDescriptor_t dyDesc,
309
+ const void *dy,
310
+ const cudnnConvolutionDescriptor_t convDesc,
311
+ const cudnnTensorDescriptor_t dxDesc,
312
+ void *dx,
313
+ const int requestedAlgoCount,
314
+ int *returnedAlgoCount,
315
+ cudnnConvolutionBwdDataAlgoPerf_t *perfResults,
316
+ void *workSpace,
317
+ size_t workSpaceSizeInBytes);
318
+
319
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
320
+ cudnnGetConvolutionBackwardDataAlgorithm_v7(cudnnHandle_t handle,
321
+ const cudnnFilterDescriptor_t filterDesc,
322
+ const cudnnTensorDescriptor_t diffDesc,
323
+ const cudnnConvolutionDescriptor_t convDesc,
324
+ const cudnnTensorDescriptor_t gradDesc,
325
+ const int requestedAlgoCount,
326
+ int *returnedAlgoCount,
327
+ cudnnConvolutionBwdDataAlgoPerf_t *perfResults);
328
+
329
+ /*
330
+ * convolution algorithm (which requires potentially some workspace)
331
+ */
332
+
333
+ /* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo*/
334
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
335
+ cudnnGetConvolutionBackwardDataWorkspaceSize(cudnnHandle_t handle,
336
+ const cudnnFilterDescriptor_t wDesc,
337
+ const cudnnTensorDescriptor_t dyDesc,
338
+ const cudnnConvolutionDescriptor_t convDesc,
339
+ const cudnnTensorDescriptor_t dxDesc,
340
+ cudnnConvolutionBwdDataAlgo_t algo,
341
+ size_t *sizeInBytes);
342
+
343
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
344
+ cudnnConvolutionBackwardData(cudnnHandle_t handle,
345
+ const void *alpha,
346
+ const cudnnFilterDescriptor_t wDesc,
347
+ const void *w,
348
+ const cudnnTensorDescriptor_t dyDesc,
349
+ const void *dy,
350
+ const cudnnConvolutionDescriptor_t convDesc,
351
+ cudnnConvolutionBwdDataAlgo_t algo,
352
+ void *workSpace,
353
+ size_t workSpaceSizeInBytes,
354
+ const void *beta,
355
+ const cudnnTensorDescriptor_t dxDesc,
356
+ void *dx);
357
+
358
+ /* Helper function to calculate folding descriptors for dgrad */
359
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
360
+ cudnnGetFoldedConvBackwardDataDescriptors(const cudnnHandle_t handle,
361
+ const cudnnFilterDescriptor_t filterDesc,
362
+ const cudnnTensorDescriptor_t diffDesc,
363
+ const cudnnConvolutionDescriptor_t convDesc,
364
+ const cudnnTensorDescriptor_t gradDesc,
365
+ const cudnnTensorFormat_t transformFormat,
366
+ cudnnFilterDescriptor_t foldedFilterDesc,
367
+ cudnnTensorDescriptor_t paddedDiffDesc,
368
+ cudnnConvolutionDescriptor_t foldedConvDesc,
369
+ cudnnTensorDescriptor_t foldedGradDesc,
370
+ cudnnTensorTransformDescriptor_t filterFoldTransDesc,
371
+ cudnnTensorTransformDescriptor_t diffPadTransDesc,
372
+ cudnnTensorTransformDescriptor_t gradFoldTransDesc,
373
+ cudnnTensorTransformDescriptor_t gradUnfoldTransDesc);
374
+
375
+ /* cudnnFusedOps... */
376
+ struct cudnnFusedOpsConstParamStruct;
377
+ typedef struct cudnnFusedOpsConstParamStruct *cudnnFusedOpsConstParamPack_t CUDNN_DEPRECATED;
378
+
379
+ struct cudnnFusedOpsVariantParamStruct;
380
+ typedef struct cudnnFusedOpsVariantParamStruct *cudnnFusedOpsVariantParamPack_t CUDNN_DEPRECATED;
381
+
382
+ struct cudnnFusedOpsPlanStruct;
383
+ typedef struct cudnnFusedOpsPlanStruct *cudnnFusedOpsPlan_t CUDNN_DEPRECATED;
384
+
385
+ typedef enum {
386
+ /* each op in [ ] can be disabled by passing NULL ptr */
387
+ /* [per channel scale], [per channel bias], [activation], convolution, [generate BN stats] */
388
+ CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS = 0,
389
+ /* [per channel scale], [per channel bias], [activation], convolutionBackwardWeights */
390
+ CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD = 1,
391
+ /* utility for BN training in BN-conv fusion */
392
+ /* computes the equivalent scale and bias from ySum ySqSum and learned scale, bias */
393
+ /* optionally update running stats and generate saved stats */
394
+ CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING = 2,
395
+ /* utility for BN inference in BN-conv fusion */
396
+ /* computes the equivalent scale and bias from learned running stats and learned scale, bias */
397
+ CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE = 3,
398
+ /* reserved for future use: convolution, [per channel scale], [per channel bias], [residual add], [activation] */
399
+ CUDNN_FUSED_CONV_SCALE_BIAS_ADD_ACTIVATION = 4,
400
+ /* reserved for future use: [per channel scale], [per channel bias], [residual add], activation, bitmask */
401
+ CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK = 5,
402
+ /* reserved for future use */
403
+ CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM = 6,
404
+ } cudnnFusedOps_t CUDNN_DEPRECATED;
405
+
406
+ typedef enum {
407
+ /* set XDESC: pass previously initialized cudnnTensorDescriptor_t */
408
+ /* get XDESC: pass previously created cudnnTensorDescriptor_t */
409
+ CUDNN_PARAM_XDESC = 0,
410
+ /* set/get XDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
411
+ CUDNN_PARAM_XDATA_PLACEHOLDER = 1,
412
+ /* set/get BN_MODE: pass cudnnBatchNormMode_t* */
413
+ CUDNN_PARAM_BN_MODE = 2,
414
+ /* set CUDNN_PARAM_BN_EQSCALEBIAS_DESC: pass previously initialized cudnnTensorDescriptor_t */
415
+ /* get CUDNN_PARAM_BN_EQSCALEBIAS_DESC: pass previously created cudnnTensorDescriptor_t */
416
+ CUDNN_PARAM_BN_EQSCALEBIAS_DESC = 3,
417
+ /* set/get BN_EQSCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
418
+ CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER = 4,
419
+ /* set/get BN_EQBIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
420
+ CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER = 5,
421
+ /* set ACTIVATION_DESC: pass previously initialized cudnnActivationDescriptor_t */
422
+ /* get ACTIVATION_DESC: pass previously created cudnnActivationDescriptor_t */
423
+ CUDNN_PARAM_ACTIVATION_DESC = 6,
424
+ /* set CONV_DESC: pass previously initialized cudnnConvolutionDescriptor_t */
425
+ /* get CONV_DESC: pass previously created cudnnConvolutionDescriptor_t */
426
+ CUDNN_PARAM_CONV_DESC = 7,
427
+ /* set WDESC: pass previously initialized cudnnFilterDescriptor_t */
428
+ /* get WDESC: pass previously created cudnnFilterDescriptor_t */
429
+ CUDNN_PARAM_WDESC = 8,
430
+ /* set/get WDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
431
+ CUDNN_PARAM_WDATA_PLACEHOLDER = 9,
432
+ /* set DWDESC: pass previously initialized cudnnFilterDescriptor_t */
433
+ /* get DWDESC: pass previously created cudnnFilterDescriptor_t */
434
+ CUDNN_PARAM_DWDESC = 10,
435
+ /* set/get DWDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
436
+ CUDNN_PARAM_DWDATA_PLACEHOLDER = 11,
437
+ /* set YDESC: pass previously initialized cudnnTensorDescriptor_t */
438
+ /* get YDESC: pass previously created cudnnTensorDescriptor_t */
439
+ CUDNN_PARAM_YDESC = 12,
440
+ /* set/get YDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
441
+ CUDNN_PARAM_YDATA_PLACEHOLDER = 13,
442
+ /* set DYDESC: pass previously initialized cudnnTensorDescriptor_t */
443
+ /* get DYDESC: pass previously created cudnnTensorDescriptor_t */
444
+ CUDNN_PARAM_DYDESC = 14,
445
+ /* set/get DYDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
446
+ CUDNN_PARAM_DYDATA_PLACEHOLDER = 15,
447
+ /* set YSTATS_DESC: pass previously initialized cudnnTensorDescriptor_t */
448
+ /* get YSTATS_DESC: pass previously created cudnnTensorDescriptor_t */
449
+ CUDNN_PARAM_YSTATS_DESC = 16,
450
+ /* set/get YSUM_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
451
+ CUDNN_PARAM_YSUM_PLACEHOLDER = 17,
452
+ /* set/get YSQSUM_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
453
+ CUDNN_PARAM_YSQSUM_PLACEHOLDER = 18,
454
+ /* set CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC: pass previously initialized cudnnTensorDescriptor_t */
455
+ /* get CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC: pass previously created cudnnTensorDescriptor_t */
456
+ CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC = 19,
457
+ /* set/get CUDNN_PARAM_BN_SCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
458
+ CUDNN_PARAM_BN_SCALE_PLACEHOLDER = 20,
459
+ /* set/get CUDNN_PARAM_BN_BIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
460
+ CUDNN_PARAM_BN_BIAS_PLACEHOLDER = 21,
461
+ /* set/get CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
462
+ CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER = 22,
463
+ /* set/get CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
464
+ CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER = 23,
465
+ /* set/get CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
466
+ CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER = 24,
467
+ /* set/get CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
468
+ CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER = 25,
469
+
470
+ /* set ZDESC: pass previously initialized cudnnTensorDescriptor_t */
471
+ /* get ZDESC: pass previously created cudnnTensorDescriptor_t */
472
+ CUDNN_PARAM_ZDESC = 26,
473
+ /* set/get ZDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
474
+ CUDNN_PARAM_ZDATA_PLACEHOLDER = 27,
475
+ /* set BN_Z_EQSCALEBIAS_DESC: pass previously initialized cudnnTensorDescriptor_t */
476
+ /* get BN_Z_EQSCALEBIAS_DESC: pass previously created cudnnTensorDescriptor_t */
477
+ CUDNN_PARAM_BN_Z_EQSCALEBIAS_DESC = 28,
478
+ /* set/get BN_Z_EQSCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
479
+ CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER = 29,
480
+ /* set/get BN_Z_EQBIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
481
+ CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER = 30,
482
+
483
+ /* set ACTIVATION_BITMASK_DESC: pass previously initialized cudnnTensorDescriptor_t */
484
+ /* get ACTIVATION_BITMASK_DESC: pass previously created cudnnTensorDescriptor_t */
485
+ CUDNN_PARAM_ACTIVATION_BITMASK_DESC = 31,
486
+ /* set/get ACTIVATION_BITMASK_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
487
+ CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER = 32,
488
+
489
+ /* set DXDESC: pass previously initialized cudnnTensorDescriptor_t */
490
+ /* get DXDESC: pass previously created cudnnTensorDescriptor_t */
491
+ CUDNN_PARAM_DXDESC = 33,
492
+ /* set/get DXDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
493
+ CUDNN_PARAM_DXDATA_PLACEHOLDER = 34,
494
+ /* set DZDESC: pass previously initialized cudnnTensorDescriptor_t */
495
+ /* get DZDESC: pass previously created cudnnTensorDescriptor_t */
496
+ CUDNN_PARAM_DZDESC = 35,
497
+ /* set/get DZDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
498
+ CUDNN_PARAM_DZDATA_PLACEHOLDER = 36,
499
+ /* set/get CUDNN_PARAM_BN_DSCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
500
+ CUDNN_PARAM_BN_DSCALE_PLACEHOLDER = 37,
501
+ /* set/get CUDNN_PARAM_BN_DBIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
502
+ CUDNN_PARAM_BN_DBIAS_PLACEHOLDER = 38,
503
+ } cudnnFusedOpsConstParamLabel_t CUDNN_DEPRECATED;
504
+
505
+ typedef enum {
506
+ CUDNN_PTR_NULL = 0,
507
+ CUDNN_PTR_ELEM_ALIGNED = 1,
508
+ CUDNN_PTR_16B_ALIGNED = 2,
509
+ } cudnnFusedOpsPointerPlaceHolder_t CUDNN_DEPRECATED;
510
+
511
+ typedef enum {
512
+ /* set: pass void* pointing to dev memory */
513
+ /* get: pass void** pointing to host memory */
514
+ CUDNN_PTR_XDATA = 0,
515
+ CUDNN_PTR_BN_EQSCALE = 1,
516
+ CUDNN_PTR_BN_EQBIAS = 2,
517
+ CUDNN_PTR_WDATA = 3,
518
+ CUDNN_PTR_DWDATA = 4,
519
+ CUDNN_PTR_YDATA = 5,
520
+ CUDNN_PTR_DYDATA = 6,
521
+ CUDNN_PTR_YSUM = 7,
522
+ CUDNN_PTR_YSQSUM = 8,
523
+ CUDNN_PTR_WORKSPACE = 9,
524
+ CUDNN_PTR_BN_SCALE = 10,
525
+ CUDNN_PTR_BN_BIAS = 11,
526
+ CUDNN_PTR_BN_SAVED_MEAN = 12,
527
+ CUDNN_PTR_BN_SAVED_INVSTD = 13,
528
+ CUDNN_PTR_BN_RUNNING_MEAN = 14,
529
+ CUDNN_PTR_BN_RUNNING_VAR = 15,
530
+ CUDNN_PTR_ZDATA = 16,
531
+ CUDNN_PTR_BN_Z_EQSCALE = 17,
532
+ CUDNN_PTR_BN_Z_EQBIAS = 18,
533
+ CUDNN_PTR_ACTIVATION_BITMASK = 19,
534
+ CUDNN_PTR_DXDATA = 20,
535
+ CUDNN_PTR_DZDATA = 21,
536
+ CUDNN_PTR_BN_DSCALE = 22,
537
+ CUDNN_PTR_BN_DBIAS = 23,
538
+
539
+ /* set/get: pass size_t* pointing to host memory */
540
+ CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES = 100,
541
+ /* set/get: pass int64_t* pointing to host memory */
542
+ CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT = 101,
543
+ /* set/get: pass double* pointing to host memory */
544
+ CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR = 102,
545
+ /* set/get: pass double* pointing to host memory */
546
+ CUDNN_SCALAR_DOUBLE_BN_EPSILON = 103,
547
+ } cudnnFusedOpsVariantParamLabel_t CUDNN_DEPRECATED;
548
+
549
+ cudnnStatus_t CUDNNWINAPI
550
+ cudnnCnnVersionCheck(void);
551
+
552
+ /* helper function to provide the convolution backward filter algo that fit best the requirement */
553
+
554
+ typedef struct cudnnConvolutionBwdFilterAlgoPerfStruct {
555
+ cudnnConvolutionBwdFilterAlgo_t algo;
556
+ cudnnStatus_t status;
557
+ float time;
558
+ size_t memory;
559
+ cudnnDeterminism_t determinism;
560
+ cudnnMathType_t mathType;
561
+ int reserved[3];
562
+ } cudnnConvolutionBwdFilterAlgoPerf_t CUDNN_DEPRECATED;
563
+
564
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
565
+ cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(cudnnHandle_t handle, int *count);
566
+
567
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
568
+ cudnnFindConvolutionBackwardFilterAlgorithm(cudnnHandle_t handle,
569
+ const cudnnTensorDescriptor_t xDesc,
570
+ const cudnnTensorDescriptor_t dyDesc,
571
+ const cudnnConvolutionDescriptor_t convDesc,
572
+ const cudnnFilterDescriptor_t dwDesc,
573
+ const int requestedAlgoCount,
574
+ int *returnedAlgoCount,
575
+ cudnnConvolutionBwdFilterAlgoPerf_t *perfResults);
576
+
577
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
578
+ cudnnFindConvolutionBackwardFilterAlgorithmEx(cudnnHandle_t handle,
579
+ const cudnnTensorDescriptor_t xDesc,
580
+ const void *x,
581
+ const cudnnTensorDescriptor_t dyDesc,
582
+ const void *y,
583
+ const cudnnConvolutionDescriptor_t convDesc,
584
+ const cudnnFilterDescriptor_t dwDesc,
585
+ void *dw,
586
+ const int requestedAlgoCount,
587
+ int *returnedAlgoCount,
588
+ cudnnConvolutionBwdFilterAlgoPerf_t *perfResults,
589
+ void *workSpace,
590
+ size_t workSpaceSizeInBytes);
591
+
592
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
593
+ cudnnGetConvolutionBackwardFilterAlgorithm_v7(cudnnHandle_t handle,
594
+ const cudnnTensorDescriptor_t srcDesc,
595
+ const cudnnTensorDescriptor_t diffDesc,
596
+ const cudnnConvolutionDescriptor_t convDesc,
597
+ const cudnnFilterDescriptor_t gradDesc,
598
+ const int requestedAlgoCount,
599
+ int *returnedAlgoCount,
600
+ cudnnConvolutionBwdFilterAlgoPerf_t *perfResults);
601
+
602
+ /*
603
+ * convolution algorithm (which requires potentially some workspace)
604
+ */
605
+
606
+ /* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo*/
607
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
608
+ cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnnHandle_t handle,
609
+ const cudnnTensorDescriptor_t xDesc,
610
+ const cudnnTensorDescriptor_t dyDesc,
611
+ const cudnnConvolutionDescriptor_t convDesc,
612
+ const cudnnFilterDescriptor_t gradDesc,
613
+ cudnnConvolutionBwdFilterAlgo_t algo,
614
+ size_t *sizeInBytes);
615
+
616
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
617
+ cudnnConvolutionBackwardFilter(cudnnHandle_t handle,
618
+ const void *alpha,
619
+ const cudnnTensorDescriptor_t xDesc,
620
+ const void *x,
621
+ const cudnnTensorDescriptor_t dyDesc,
622
+ const void *dy,
623
+ const cudnnConvolutionDescriptor_t convDesc,
624
+ cudnnConvolutionBwdFilterAlgo_t algo,
625
+ void *workSpace,
626
+ size_t workSpaceSizeInBytes,
627
+ const void *beta,
628
+ const cudnnFilterDescriptor_t dwDesc,
629
+ void *dw);
630
+
631
+ /* Function to compute the bias gradient for batch convolution */
632
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
633
+ cudnnConvolutionBackwardBias(cudnnHandle_t handle,
634
+ const void *alpha,
635
+ const cudnnTensorDescriptor_t dyDesc,
636
+ const void *dy,
637
+ const void *beta,
638
+ const cudnnTensorDescriptor_t dbDesc,
639
+ void *db);
640
+
641
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
642
+ cudnnCreateFusedOpsConstParamPack(cudnnFusedOpsConstParamPack_t *constPack, cudnnFusedOps_t ops);
643
+
644
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
645
+ cudnnDestroyFusedOpsConstParamPack(cudnnFusedOpsConstParamPack_t constPack);
646
+
647
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
648
+ cudnnSetFusedOpsConstParamPackAttribute(cudnnFusedOpsConstParamPack_t constPack,
649
+ cudnnFusedOpsConstParamLabel_t paramLabel,
650
+ const void *param);
651
+
652
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
653
+ cudnnGetFusedOpsConstParamPackAttribute(const cudnnFusedOpsConstParamPack_t constPack,
654
+ cudnnFusedOpsConstParamLabel_t paramLabel,
655
+ void *param,
656
+ int *isNULL);
657
+
658
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
659
+ cudnnCreateFusedOpsVariantParamPack(cudnnFusedOpsVariantParamPack_t *varPack, cudnnFusedOps_t ops);
660
+
661
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
662
+ cudnnDestroyFusedOpsVariantParamPack(cudnnFusedOpsVariantParamPack_t varPack);
663
+
664
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
665
+ cudnnSetFusedOpsVariantParamPackAttribute(cudnnFusedOpsVariantParamPack_t varPack,
666
+ cudnnFusedOpsVariantParamLabel_t paramLabel,
667
+ void *ptr);
668
+
669
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
670
+ cudnnGetFusedOpsVariantParamPackAttribute(const cudnnFusedOpsVariantParamPack_t varPack,
671
+ cudnnFusedOpsVariantParamLabel_t paramLabel,
672
+ void *ptr);
673
+
674
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
675
+ cudnnCreateFusedOpsPlan(cudnnFusedOpsPlan_t *plan, cudnnFusedOps_t ops);
676
+
677
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
678
+ cudnnDestroyFusedOpsPlan(cudnnFusedOpsPlan_t plan);
679
+
680
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
681
+ cudnnMakeFusedOpsPlan(cudnnHandle_t handle,
682
+ cudnnFusedOpsPlan_t plan,
683
+ const cudnnFusedOpsConstParamPack_t constPack,
684
+ size_t *workspaceSizeInBytes);
685
+
686
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
687
+ cudnnFusedOpsExecute(cudnnHandle_t handle, const cudnnFusedOpsPlan_t plan, cudnnFusedOpsVariantParamPack_t varPack);
688
+
689
+ #if defined(__cplusplus)
690
+ }
691
+ #endif
692
+
693
+ #endif /* CUDNN_CNN_H_ */
.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_graph.h ADDED
@@ -0,0 +1,992 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2014-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /*
51
+ * cudnn_graph : cuDNN's basic definitions operations.
52
+ */
53
+
54
+ #if !defined(CUDNN_GRAPH_H_)
55
+ #define CUDNN_GRAPH_H_
56
+
57
+ #include <cuda_runtime_api.h>
58
+ #include <library_types.h>
59
+
60
+ #include <stdint.h>
61
+
62
+ #include "cudnn_version.h"
63
+
64
+ /* These version numbers are autogenerated, do not edit manually. */
65
+ #define CUDNN_GRAPH_MAJOR 9
66
+ #define CUDNN_GRAPH_MINOR 10
67
+ #define CUDNN_GRAPH_PATCH 2
68
+
69
+ #if (CUDNN_GRAPH_MAJOR != CUDNN_MAJOR) || (CUDNN_GRAPH_MINOR != CUDNN_MINOR) || (CUDNN_GRAPH_PATCH != CUDNN_PATCHLEVEL)
70
+ #error Version mismatch in cuDNN GRAPH!!!
71
+ #endif
72
+
73
+ #ifndef CUDNNWINAPI
74
+ #ifdef _WIN32
75
+ #define CUDNNWINAPI __stdcall
76
+ #else
77
+ #define CUDNNWINAPI
78
+ #endif
79
+ #endif
80
+
81
+ /* Warnings for deprecated API-s are enabled using the CUDNN_WARN_DEPRECATED macro */
82
+ #if defined(CUDNN_WARN_DEPRECATED) && (defined(__GNUC__) || defined(__clang__))
83
+ /* GCC, Intel C/C++, Cray C/C++, CLANG, IBM XL C/C++ little endian */
84
+ #define CUDNN_DEPRECATED __attribute__((deprecated))
85
+ #define CUDNN_DEPRECATED_ENUM __attribute__((deprecated))
86
+ #elif defined(CUDNN_WARN_DEPRECATED) && defined(_MSC_VER)
87
+ /* Microsoft Visual C++ */
88
+ #define CUDNN_DEPRECATED __declspec(deprecated)
89
+ #define CUDNN_DEPRECATED_ENUM __declspec(deprecated)
90
+ #elif defined(CUDNN_WARN_DEPRECATED) && (__cplusplus >= 201402L)
91
+ /* C++14 compilers */
92
+ #define CUDNN_DEPRECATED [[deprecated]]
93
+ #define CUDNN_DEPRECATED_ENUM [[deprecated]]
94
+ #else
95
+ /* No support for the deprecated attribute */
96
+ #define CUDNN_DEPRECATED
97
+ #define CUDNN_DEPRECATED_ENUM
98
+ #endif
99
+
100
+ #if defined(__cplusplus)
101
+ extern "C" {
102
+ #endif
103
+
104
+ struct cudnnContext;
105
+ typedef struct cudnnContext *cudnnHandle_t;
106
+
107
+ size_t CUDNNWINAPI
108
+ cudnnGetVersion(void);
109
+
110
+ size_t CUDNNWINAPI
111
+ cudnnGetMaxDeviceVersion(void);
112
+
113
+ /* Returns CUDA Runtime version statically linked against cudnn */
114
+ size_t CUDNNWINAPI
115
+ cudnnGetCudartVersion(void);
116
+
117
+ /*
118
+ * CUDNN return codes
119
+ */
120
+ typedef enum {
121
+ CUDNN_STATUS_SUCCESS = 0,
122
+
123
+ /* Uncategorized errors */
124
+ CUDNN_STATUS_NOT_INITIALIZED = 1001,
125
+ CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH = 1002,
126
+ CUDNN_STATUS_SERIALIZATION_VERSION_MISMATCH = 1003,
127
+ CUDNN_STATUS_DEPRECATED = 1004,
128
+ CUDNN_STATUS_LICENSE_ERROR = 1005,
129
+ CUDNN_STATUS_RUNTIME_IN_PROGRESS = 1006,
130
+ CUDNN_STATUS_RUNTIME_FP_OVERFLOW = 1007,
131
+ CUDNN_STATUS_SUBLIBRARY_LOADING_FAILED = 1008,
132
+
133
+ CUDNN_STATUS_BAD_PARAM = 2000,
134
+ CUDNN_STATUS_BAD_PARAM_NULL_POINTER = 2002,
135
+ CUDNN_STATUS_BAD_PARAM_MISALIGNED_POINTER = 2003,
136
+ CUDNN_STATUS_BAD_PARAM_NOT_FINALIZED = 2004,
137
+ CUDNN_STATUS_BAD_PARAM_OUT_OF_BOUND = 2005,
138
+ CUDNN_STATUS_BAD_PARAM_SIZE_INSUFFICIENT = 2006,
139
+ CUDNN_STATUS_BAD_PARAM_STREAM_MISMATCH = 2007,
140
+ CUDNN_STATUS_BAD_PARAM_SHAPE_MISMATCH = 2008,
141
+ CUDNN_STATUS_BAD_PARAM_DUPLICATED_ENTRIES = 2009,
142
+ CUDNN_STATUS_BAD_PARAM_ATTRIBUTE_TYPE = 2010,
143
+ CUDNN_STATUS_BAD_PARAM_CUDA_GRAPH_MISMATCH = 2011,
144
+ CUDNN_STATUS_BAD_PARAM_DESCRIPTOR_TYPE = 2012,
145
+
146
+ CUDNN_STATUS_NOT_SUPPORTED = 3000,
147
+ CUDNN_STATUS_NOT_SUPPORTED_GRAPH_PATTERN = 3001,
148
+ CUDNN_STATUS_NOT_SUPPORTED_SHAPE = 3002,
149
+ CUDNN_STATUS_NOT_SUPPORTED_DATA_TYPE = 3003,
150
+ CUDNN_STATUS_NOT_SUPPORTED_LAYOUT = 3004,
151
+ CUDNN_STATUS_NOT_SUPPORTED_INCOMPATIBLE_CUDA_DRIVER = 3005,
152
+ CUDNN_STATUS_NOT_SUPPORTED_INCOMPATIBLE_CUDART = 3006,
153
+ CUDNN_STATUS_NOT_SUPPORTED_ARCH_MISMATCH = 3007,
154
+ CUDNN_STATUS_NOT_SUPPORTED_RUNTIME_PREREQUISITE_MISSING = 3008,
155
+ CUDNN_STATUS_NOT_SUPPORTED_SUBLIBRARY_UNAVAILABLE = 3009,
156
+ CUDNN_STATUS_NOT_SUPPORTED_SHARED_MEMORY_INSUFFICIENT = 3010,
157
+ CUDNN_STATUS_NOT_SUPPORTED_PADDING = 3011,
158
+ CUDNN_STATUS_NOT_SUPPORTED_BAD_LAUNCH_PARAM = 3012,
159
+ CUDNN_STATUS_NOT_SUPPORTED_CUDA_GRAPH_NATIVE_API = 3013,
160
+
161
+ CUDNN_STATUS_INTERNAL_ERROR = 4000,
162
+ CUDNN_STATUS_INTERNAL_ERROR_COMPILATION_FAILED = 4001,
163
+ CUDNN_STATUS_INTERNAL_ERROR_UNEXPECTED_VALUE = 4002,
164
+ CUDNN_STATUS_INTERNAL_ERROR_HOST_ALLOCATION_FAILED = 4003,
165
+ CUDNN_STATUS_INTERNAL_ERROR_DEVICE_ALLOCATION_FAILED = 4004,
166
+ CUDNN_STATUS_INTERNAL_ERROR_BAD_LAUNCH_PARAM = 4005,
167
+ CUDNN_STATUS_INTERNAL_ERROR_TEXTURE_CREATION_FAILED = 4006,
168
+
169
+ CUDNN_STATUS_EXECUTION_FAILED = 5000,
170
+ CUDNN_STATUS_EXECUTION_FAILED_CUDA_DRIVER = 5001,
171
+ CUDNN_STATUS_EXECUTION_FAILED_CUBLAS = 5002,
172
+ CUDNN_STATUS_EXECUTION_FAILED_CUDART = 5003,
173
+ CUDNN_STATUS_EXECUTION_FAILED_CURAND = 5004,
174
+
175
+ CUDNN_STATUS_ALLOC_FAILED CUDNN_DEPRECATED_ENUM = CUDNN_STATUS_INTERNAL_ERROR_HOST_ALLOCATION_FAILED,
176
+ CUDNN_STATUS_INVALID_VALUE CUDNN_DEPRECATED_ENUM = 2001 /* please transition to CUDNN_STATUS_BAD_PARAM instead */,
177
+ CUDNN_STATUS_ARCH_MISMATCH CUDNN_DEPRECATED_ENUM = CUDNN_STATUS_NOT_SUPPORTED_ARCH_MISMATCH,
178
+ CUDNN_STATUS_MAPPING_ERROR CUDNN_DEPRECATED_ENUM = CUDNN_STATUS_INTERNAL_ERROR_TEXTURE_CREATION_FAILED,
179
+ CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING CUDNN_DEPRECATED_ENUM =
180
+ CUDNN_STATUS_NOT_SUPPORTED_RUNTIME_PREREQUISITE_MISSING,
181
+ CUDNN_STATUS_VERSION_MISMATCH CUDNN_DEPRECATED_ENUM = CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH,
182
+ } cudnnStatus_t;
183
+
184
+ #define CUDNN_STATUS_FULL_ERROR_CODE(category, specific_err) ((cudnnStatus_t)(0 + (category) + (specific_err)))
185
+ #define CUDNN_STATUS_CATEGORY(full_error_code) ((full_error_code) / 1000 * 1000)
186
+ #define CUDNN_STATUS_SPECIFIC_ERROR(full_error_code) ((full_error_code) % 1000)
187
+
188
+ /* human-readable error messages */
189
+ const char *CUDNNWINAPI
190
+ cudnnGetErrorString(cudnnStatus_t status);
191
+
192
+ void CUDNNWINAPI
193
+ cudnnGetLastErrorString(char *message, size_t max_size);
194
+
195
+ /* Forward definition in this version only */
196
+ typedef struct cudnnRuntimeTag_t cudnnRuntimeTag_t CUDNN_DEPRECATED;
197
+
198
+ typedef enum {
199
+ CUDNN_ERRQUERY_RAWCODE = 0,
200
+ CUDNN_ERRQUERY_NONBLOCKING = 1,
201
+ CUDNN_ERRQUERY_BLOCKING = 2,
202
+ } cudnnErrQueryMode_t;
203
+
204
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
205
+ cudnnQueryRuntimeError(cudnnHandle_t handle, cudnnStatus_t *rstatus, cudnnErrQueryMode_t mode, cudnnRuntimeTag_t *tag);
206
+
207
+ cudnnStatus_t CUDNNWINAPI
208
+ cudnnGetProperty(libraryPropertyType type, int *value);
209
+
210
+ cudnnStatus_t CUDNNWINAPI
211
+ cudnnCreate(cudnnHandle_t *handle);
212
+ cudnnStatus_t CUDNNWINAPI
213
+ cudnnDestroy(cudnnHandle_t handle);
214
+ cudnnStatus_t CUDNNWINAPI
215
+ cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId);
216
+ cudnnStatus_t CUDNNWINAPI
217
+ cudnnGetStream(cudnnHandle_t handle, cudaStream_t *streamId);
218
+ /*
219
+ * CUDNN data type
220
+ */
221
+ typedef enum {
222
+ CUDNN_DATA_FLOAT = 0,
223
+ CUDNN_DATA_DOUBLE = 1,
224
+ CUDNN_DATA_HALF = 2,
225
+ CUDNN_DATA_INT8 = 3,
226
+ CUDNN_DATA_INT32 = 4,
227
+ CUDNN_DATA_INT8x4 CUDNN_DEPRECATED_ENUM = 5,
228
+ CUDNN_DATA_UINT8 = 6,
229
+ CUDNN_DATA_UINT8x4 CUDNN_DEPRECATED_ENUM = 7,
230
+ CUDNN_DATA_INT8x32 CUDNN_DEPRECATED_ENUM = 8,
231
+ CUDNN_DATA_BFLOAT16 = 9,
232
+ CUDNN_DATA_INT64 = 10,
233
+ CUDNN_DATA_BOOLEAN = 11,
234
+ CUDNN_DATA_FP8_E4M3 = 12,
235
+ CUDNN_DATA_FP8_E5M2 = 13,
236
+ CUDNN_DATA_FAST_FLOAT_FOR_FP8 = 14,
237
+ CUDNN_DATA_FP8_E8M0 = 15,
238
+ CUDNN_DATA_FP4_E2M1 = 16,
239
+ } cudnnDataType_t;
240
+
241
+ /*
242
+ * CUDNN math type
243
+ */
244
+ typedef enum {
245
+ CUDNN_DEFAULT_MATH = 0,
246
+ CUDNN_TENSOR_OP_MATH = 1,
247
+ CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION = 2,
248
+ CUDNN_FMA_MATH = 3,
249
+ } cudnnMathType_t;
250
+
251
+ /*
252
+ * CUDNN propagate Nan
253
+ */
254
+ typedef enum {
255
+ CUDNN_NOT_PROPAGATE_NAN CUDNN_DEPRECATED_ENUM = 0,
256
+ CUDNN_PROPAGATE_NAN CUDNN_DEPRECATED_ENUM = 1,
257
+ } cudnnNanPropagation_t;
258
+
259
+ /*
260
+ * Behavior for OOB samples. OOB samples are samples where L+R > T is encountered during the gradient calculation. If
261
+ * gradMode is set to CUDNN_CTC_SKIP_OOB_GRADIENTS, then the CTC loss function does not write to the gradient buffer for
262
+ * that sample. Instead, the current values, even not finite, are retained. If gradMode is set to
263
+ * CUDNN_CTC_ZERO_OOB_GRADIENTS, then the gradient for that sample is set to zero. This guarantees a finite gradient.
264
+ */
265
+ typedef enum {
266
+ CUDNN_CTC_ZERO_OOB_GRADIENTS = 0,
267
+ CUDNN_CTC_SKIP_OOB_GRADIENTS = 1,
268
+ } cudnnCTCGradMode_t;
269
+
270
+ typedef enum {
271
+ CUDNN_TENSOR_NCHW = 0, /* row major (wStride = 1, hStride = w) */
272
+ CUDNN_TENSOR_NHWC = 1, /* feature maps interleaved ( cStride = 1 )*/
273
+ CUDNN_TENSOR_NCHW_VECT_C = 2, /* each image point is vector of element of C, vector length in data type */
274
+ } cudnnTensorFormat_t;
275
+
276
+ /*
277
+ * CUDNN ReduceTensor op type
278
+ */
279
+ typedef enum {
280
+ CUDNN_REDUCE_TENSOR_ADD = 0,
281
+ CUDNN_REDUCE_TENSOR_MUL = 1,
282
+ CUDNN_REDUCE_TENSOR_MIN = 2,
283
+ CUDNN_REDUCE_TENSOR_MAX = 3,
284
+ CUDNN_REDUCE_TENSOR_AMAX = 4,
285
+ CUDNN_REDUCE_TENSOR_AVG = 5,
286
+ CUDNN_REDUCE_TENSOR_NORM1 = 6,
287
+ CUDNN_REDUCE_TENSOR_NORM2 = 7,
288
+ CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS = 8,
289
+ } cudnnReduceTensorOp_t;
290
+
291
+ /*
292
+ * activation mode
293
+ */
294
+ typedef enum {
295
+ CUDNN_ACTIVATION_SIGMOID = 0,
296
+ CUDNN_ACTIVATION_RELU = 1,
297
+ CUDNN_ACTIVATION_TANH = 2,
298
+ CUDNN_ACTIVATION_CLIPPED_RELU = 3,
299
+ CUDNN_ACTIVATION_ELU = 4,
300
+ CUDNN_ACTIVATION_IDENTITY = 5,
301
+ CUDNN_ACTIVATION_SWISH = 6
302
+ } cudnnActivationMode_t CUDNN_DEPRECATED;
303
+
304
+ typedef enum {
305
+ CUDNN_SEV_FATAL = 0,
306
+ CUDNN_SEV_ERROR = 1,
307
+ CUDNN_SEV_WARNING = 2,
308
+ CUDNN_SEV_INFO = 3,
309
+ } cudnnSeverity_t;
310
+
311
+ /* Message masks to be used with cudnnSetCallback() */
312
+ #define CUDNN_SEV_ERROR_EN (1U << CUDNN_SEV_ERROR)
313
+ #define CUDNN_SEV_WARNING_EN (1U << CUDNN_SEV_WARNING)
314
+ #define CUDNN_SEV_INFO_EN (1U << CUDNN_SEV_INFO)
315
+
316
+ /* struct containing useful informaiton for each API call */
317
+ typedef struct cudnnDebugStruct {
318
+ unsigned cudnn_version;
319
+ cudnnStatus_t cudnnStatus;
320
+ unsigned time_sec; /* epoch time in seconds */
321
+ unsigned time_usec; /* microseconds part of epoch time */
322
+ unsigned time_delta; /* time since start in seconds */
323
+ cudnnHandle_t handle; /* cudnn handle */
324
+ cudaStream_t stream; /* cuda stream ID */
325
+ unsigned long long pid; /* process ID */
326
+ unsigned long long tid; /* thread ID */
327
+ int cudaDeviceId; /* CUDA device ID */
328
+ int reserved[15]; /* reserved for future use */
329
+ } cudnnDebug_t;
330
+
331
+ typedef void (*cudnnCallback_t)(cudnnSeverity_t sev, void *udata, const cudnnDebug_t *dbg, const char *msg);
332
+
333
+ cudnnStatus_t CUDNNWINAPI
334
+ cudnnSetCallback(unsigned mask, void *udata, cudnnCallback_t fptr);
335
+
336
+ cudnnStatus_t CUDNNWINAPI
337
+ cudnnGetCallback(unsigned *mask, void **udata, cudnnCallback_t *fptr);
338
+
339
+ /*
340
+ * \brief Cross-library version checker.
341
+ * This function is implemented differently in each sub-library. Each sublib
342
+ * checks whether its own version matches that of its dependencies.
343
+ * \returns CUDNN_STATUS_SUCCESS if the version check passes,
344
+ * CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH if the versions are inconsistent.
345
+ */
346
+ cudnnStatus_t CUDNNWINAPI
347
+ cudnnGraphVersionCheck(void);
348
+
349
+ /* Maximum supported number of tensor dimensions */
350
+ #define CUDNN_DIM_MAX 8
351
+
352
+ /*
353
+ * convolution mode
354
+ */
355
+ typedef enum { CUDNN_CONVOLUTION = 0, CUDNN_CROSS_CORRELATION = 1 } cudnnConvolutionMode_t;
356
+
357
+ /*
358
+ * CUDNN Reorder
359
+ */
360
+ typedef enum {
361
+ CUDNN_DEFAULT_REORDER = 0,
362
+ CUDNN_NO_REORDER = 1,
363
+ } cudnnReorderType_t CUDNN_DEPRECATED;
364
+
365
+ typedef void *cudnnBackendDescriptor_t;
366
+
367
+ typedef struct cudnnFractionStruct {
368
+ int64_t numerator;
369
+ int64_t denominator;
370
+ } cudnnFraction_t;
371
+
372
+ typedef enum {
373
+ CUDNN_POINTWISE_ADD = 0,
374
+ CUDNN_POINTWISE_ADD_SQUARE = 5,
375
+ CUDNN_POINTWISE_DIV = 6,
376
+ CUDNN_POINTWISE_MAX = 3,
377
+ CUDNN_POINTWISE_MIN = 2,
378
+ CUDNN_POINTWISE_MOD = 7,
379
+ CUDNN_POINTWISE_MUL = 1,
380
+ CUDNN_POINTWISE_POW = 8,
381
+ CUDNN_POINTWISE_SUB = 9,
382
+
383
+ CUDNN_POINTWISE_ABS = 10,
384
+ CUDNN_POINTWISE_CEIL = 11,
385
+ CUDNN_POINTWISE_COS = 12,
386
+ CUDNN_POINTWISE_EXP = 13,
387
+ CUDNN_POINTWISE_FLOOR = 14,
388
+ CUDNN_POINTWISE_LOG = 15,
389
+ CUDNN_POINTWISE_NEG = 16,
390
+ CUDNN_POINTWISE_RSQRT = 17,
391
+ CUDNN_POINTWISE_SIN = 18,
392
+ CUDNN_POINTWISE_SQRT = 4,
393
+ CUDNN_POINTWISE_TAN = 19,
394
+ CUDNN_POINTWISE_ERF = 20,
395
+ CUDNN_POINTWISE_IDENTITY = 21,
396
+ CUDNN_POINTWISE_RECIPROCAL = 22,
397
+ CUDNN_POINTWISE_ATAN2 = 23,
398
+
399
+ CUDNN_POINTWISE_RELU_FWD = 100,
400
+ CUDNN_POINTWISE_TANH_FWD = 101,
401
+ CUDNN_POINTWISE_SIGMOID_FWD = 102,
402
+ CUDNN_POINTWISE_ELU_FWD = 103,
403
+ CUDNN_POINTWISE_GELU_FWD = 104,
404
+ CUDNN_POINTWISE_SOFTPLUS_FWD = 105,
405
+ CUDNN_POINTWISE_SWISH_FWD = 106,
406
+ CUDNN_POINTWISE_GELU_APPROX_TANH_FWD = 107,
407
+
408
+ CUDNN_POINTWISE_RELU_BWD = 200,
409
+ CUDNN_POINTWISE_TANH_BWD = 201,
410
+ CUDNN_POINTWISE_SIGMOID_BWD = 202,
411
+ CUDNN_POINTWISE_ELU_BWD = 203,
412
+ CUDNN_POINTWISE_GELU_BWD = 204,
413
+ CUDNN_POINTWISE_SOFTPLUS_BWD = 205,
414
+ CUDNN_POINTWISE_SWISH_BWD = 206,
415
+ CUDNN_POINTWISE_GELU_APPROX_TANH_BWD = 207,
416
+
417
+ CUDNN_POINTWISE_CMP_EQ = 300,
418
+ CUDNN_POINTWISE_CMP_NEQ = 301,
419
+ CUDNN_POINTWISE_CMP_GT = 302,
420
+ CUDNN_POINTWISE_CMP_GE = 303,
421
+ CUDNN_POINTWISE_CMP_LT = 304,
422
+ CUDNN_POINTWISE_CMP_LE = 305,
423
+
424
+ CUDNN_POINTWISE_LOGICAL_AND = 400,
425
+ CUDNN_POINTWISE_LOGICAL_OR = 401,
426
+ CUDNN_POINTWISE_LOGICAL_NOT = 402,
427
+
428
+ CUDNN_POINTWISE_GEN_INDEX = 501,
429
+
430
+ CUDNN_POINTWISE_BINARY_SELECT = 601,
431
+ } cudnnPointwiseMode_t;
432
+
433
+ typedef enum {
434
+ CUDNN_RESAMPLE_NEAREST = 0,
435
+ CUDNN_RESAMPLE_BILINEAR = 1,
436
+ CUDNN_RESAMPLE_AVGPOOL = 2,
437
+ CUDNN_RESAMPLE_AVGPOOL_INCLUDE_PADDING = 2,
438
+ CUDNN_RESAMPLE_AVGPOOL_EXCLUDE_PADDING = 4,
439
+ CUDNN_RESAMPLE_MAXPOOL = 3,
440
+ } cudnnResampleMode_t;
441
+
442
+ typedef enum {
443
+ CUDNN_SIGNAL_SET = 0,
444
+ CUDNN_SIGNAL_WAIT = 1,
445
+ } cudnnSignalMode_t;
446
+
447
+ typedef enum {
448
+ CUDNN_GENSTATS_SUM_SQSUM = 0,
449
+ } cudnnGenStatsMode_t;
450
+
451
+ typedef enum {
452
+ CUDNN_BN_FINALIZE_STATISTICS_TRAINING = 0,
453
+ CUDNN_BN_FINALIZE_STATISTICS_INFERENCE = 1,
454
+ } cudnnBnFinalizeStatsMode_t;
455
+
456
+ typedef enum {
457
+ CUDNN_RNG_DISTRIBUTION_BERNOULLI = 0,
458
+ CUDNN_RNG_DISTRIBUTION_UNIFORM = 1,
459
+ CUDNN_RNG_DISTRIBUTION_NORMAL = 2,
460
+ } cudnnRngDistribution_t;
461
+
462
+ typedef enum {
463
+ CUDNN_ATTR_POINTWISE_MODE = 0,
464
+ CUDNN_ATTR_POINTWISE_MATH_PREC = 1,
465
+ CUDNN_ATTR_POINTWISE_NAN_PROPAGATION CUDNN_DEPRECATED_ENUM = 2,
466
+ CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP = 3,
467
+ CUDNN_ATTR_POINTWISE_RELU_UPPER_CLIP = 4,
468
+ CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP_SLOPE = 5,
469
+ CUDNN_ATTR_POINTWISE_ELU_ALPHA = 6,
470
+ CUDNN_ATTR_POINTWISE_SOFTPLUS_BETA = 7,
471
+ CUDNN_ATTR_POINTWISE_SWISH_BETA = 8,
472
+ CUDNN_ATTR_POINTWISE_AXIS = 9,
473
+
474
+ CUDNN_ATTR_CONVOLUTION_COMP_TYPE = 100,
475
+ CUDNN_ATTR_CONVOLUTION_CONV_MODE = 101,
476
+ CUDNN_ATTR_CONVOLUTION_DILATIONS = 102,
477
+ CUDNN_ATTR_CONVOLUTION_FILTER_STRIDES = 103,
478
+ CUDNN_ATTR_CONVOLUTION_POST_PADDINGS = 104,
479
+ CUDNN_ATTR_CONVOLUTION_PRE_PADDINGS = 105,
480
+ CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS = 106,
481
+
482
+ CUDNN_ATTR_ENGINEHEUR_MODE = 200,
483
+ CUDNN_ATTR_ENGINEHEUR_OPERATION_GRAPH = 201,
484
+ CUDNN_ATTR_ENGINEHEUR_RESULTS = 202,
485
+ CUDNN_ATTR_ENGINEHEUR_SM_COUNT_TARGET = 203,
486
+ CUDNN_ATTR_ENGINEHEUR_DEVICEPROP = 204,
487
+
488
+ CUDNN_ATTR_ENGINECFG_ENGINE = 300,
489
+ CUDNN_ATTR_ENGINECFG_INTERMEDIATE_INFO = 301,
490
+ CUDNN_ATTR_ENGINECFG_KNOB_CHOICES = 302,
491
+ CUDNN_ATTR_ENGINECFG_WORKSPACE_SIZE = 303,
492
+ CUDNN_ATTR_ENGINECFG_SHARED_MEMORY_USED = 304,
493
+
494
+ CUDNN_ATTR_EXECUTION_PLAN_HANDLE CUDNN_DEPRECATED_ENUM = 400,
495
+ CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG = 401,
496
+ CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE = 402,
497
+ CUDNN_ATTR_EXECUTION_PLAN_COMPUTED_INTERMEDIATE_UIDS = 403,
498
+ CUDNN_ATTR_EXECUTION_PLAN_RUN_ONLY_INTERMEDIATE_UIDS = 404,
499
+ CUDNN_ATTR_EXECUTION_PLAN_JSON_REPRESENTATION = 405,
500
+ CUDNN_ATTR_EXECUTION_PLAN_KERNEL_CACHE = 406,
501
+ CUDNN_ATTR_EXECUTION_PLAN_DEVICEPROP = 407,
502
+
503
+ CUDNN_ATTR_INTERMEDIATE_INFO_UNIQUE_ID = 500,
504
+ CUDNN_ATTR_INTERMEDIATE_INFO_SIZE = 501,
505
+ CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_DATA_UIDS = 502,
506
+ CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_ATTRIBUTES = 503,
507
+
508
+ CUDNN_ATTR_KNOB_CHOICE_KNOB_TYPE = 600,
509
+ CUDNN_ATTR_KNOB_CHOICE_KNOB_VALUE = 601,
510
+
511
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_ALPHA = 700,
512
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_BETA = 701,
513
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_CONV_DESC = 702,
514
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_W = 703,
515
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_X = 704,
516
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_Y = 705,
517
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_ALPHA = 706,
518
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_BETA = 707,
519
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_CONV_DESC = 708,
520
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_W = 709,
521
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DX = 710,
522
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DY = 711,
523
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_ALPHA = 712,
524
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_BETA = 713,
525
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_CONV_DESC = 714,
526
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DW = 715,
527
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_X = 716,
528
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DY = 717,
529
+
530
+ CUDNN_ATTR_OPERATION_POINTWISE_PW_DESCRIPTOR = 750,
531
+ CUDNN_ATTR_OPERATION_POINTWISE_XDESC = 751,
532
+ CUDNN_ATTR_OPERATION_POINTWISE_BDESC = 752,
533
+ CUDNN_ATTR_OPERATION_POINTWISE_YDESC = 753,
534
+ CUDNN_ATTR_OPERATION_POINTWISE_ALPHA1 = 754,
535
+ CUDNN_ATTR_OPERATION_POINTWISE_ALPHA2 = 755,
536
+ CUDNN_ATTR_OPERATION_POINTWISE_DXDESC = 756,
537
+ CUDNN_ATTR_OPERATION_POINTWISE_DYDESC = 757,
538
+ CUDNN_ATTR_OPERATION_POINTWISE_TDESC = 758,
539
+
540
+ CUDNN_ATTR_OPERATION_GENSTATS_MODE = 770,
541
+ CUDNN_ATTR_OPERATION_GENSTATS_MATH_PREC = 771,
542
+ CUDNN_ATTR_OPERATION_GENSTATS_XDESC = 772,
543
+ CUDNN_ATTR_OPERATION_GENSTATS_SUMDESC = 773,
544
+ CUDNN_ATTR_OPERATION_GENSTATS_SQSUMDESC = 774,
545
+
546
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_STATS_MODE = 780,
547
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_MATH_PREC = 781,
548
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SUM_DESC = 782,
549
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SQ_SUM_DESC = 783,
550
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_SCALE_DESC = 784,
551
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_BIAS_DESC = 785,
552
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_MEAN_DESC = 786,
553
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_VAR_DESC = 787,
554
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_MEAN_DESC = 788,
555
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_VAR_DESC = 789,
556
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_MEAN_DESC = 790,
557
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_INV_STD_DESC = 791,
558
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_SCALE_DESC = 792,
559
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_BIAS_DESC = 793,
560
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_ACCUM_COUNT_DESC = 794,
561
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_EPSILON_DESC = 795,
562
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_EXP_AVERATE_FACTOR_DESC = 796,
563
+
564
+ CUDNN_ATTR_OPERATIONGRAPH_HANDLE CUDNN_DEPRECATED_ENUM = 800,
565
+ CUDNN_ATTR_OPERATIONGRAPH_OPS = 801,
566
+ CUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNT = 802,
567
+ CUDNN_ATTR_OPERATIONGRAPH_IS_DYNAMIC_SHAPE_ENABLED = 803,
568
+ CUDNN_ATTR_OPERATIONGRAPH_IS_SAME_TOPOLOGY = 804,
569
+
570
+ CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT = 900,
571
+ CUDNN_ATTR_TENSOR_DATA_TYPE = 901,
572
+ CUDNN_ATTR_TENSOR_DIMENSIONS = 902,
573
+ CUDNN_ATTR_TENSOR_STRIDES = 903,
574
+ CUDNN_ATTR_TENSOR_VECTOR_COUNT = 904,
575
+ CUDNN_ATTR_TENSOR_VECTORIZED_DIMENSION = 905,
576
+ CUDNN_ATTR_TENSOR_UNIQUE_ID = 906,
577
+ CUDNN_ATTR_TENSOR_IS_VIRTUAL = 907,
578
+ CUDNN_ATTR_TENSOR_IS_BY_VALUE = 908,
579
+ CUDNN_ATTR_TENSOR_REORDERING_MODE = 909,
580
+ CUDNN_ATTR_TENSOR_RAGGED_OFFSET_DESC = 913,
581
+
582
+ CUDNN_ATTR_VARIANT_PACK_UNIQUE_IDS = 1000,
583
+ CUDNN_ATTR_VARIANT_PACK_DATA_POINTERS = 1001,
584
+ CUDNN_ATTR_VARIANT_PACK_INTERMEDIATES = 1002,
585
+ CUDNN_ATTR_VARIANT_PACK_WORKSPACE = 1003,
586
+
587
+ CUDNN_ATTR_LAYOUT_INFO_TENSOR_UID = 1100,
588
+ CUDNN_ATTR_LAYOUT_INFO_TYPES = 1101,
589
+
590
+ CUDNN_ATTR_KNOB_INFO_TYPE = 1200,
591
+ CUDNN_ATTR_KNOB_INFO_MAXIMUM_VALUE = 1201,
592
+ CUDNN_ATTR_KNOB_INFO_MINIMUM_VALUE = 1202,
593
+ CUDNN_ATTR_KNOB_INFO_STRIDE = 1203,
594
+
595
+ CUDNN_ATTR_ENGINE_OPERATION_GRAPH = 1300,
596
+ CUDNN_ATTR_ENGINE_GLOBAL_INDEX = 1301,
597
+ CUDNN_ATTR_ENGINE_KNOB_INFO = 1302,
598
+ CUDNN_ATTR_ENGINE_NUMERICAL_NOTE = 1303,
599
+ CUDNN_ATTR_ENGINE_LAYOUT_INFO = 1304,
600
+ CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE = 1305,
601
+ CUDNN_ATTR_ENGINE_SM_COUNT_TARGET = 1306,
602
+ CUDNN_ATTR_ENGINE_DEVICEPROP = 1307,
603
+
604
+ CUDNN_ATTR_MATMUL_COMP_TYPE = 1500,
605
+ CUDNN_ATTR_MATMUL_PADDING_VALUE = 1503,
606
+
607
+ CUDNN_ATTR_OPERATION_MATMUL_ADESC = 1520,
608
+ CUDNN_ATTR_OPERATION_MATMUL_BDESC = 1521,
609
+ CUDNN_ATTR_OPERATION_MATMUL_CDESC = 1522,
610
+ CUDNN_ATTR_OPERATION_MATMUL_DESC = 1523,
611
+ CUDNN_ATTR_OPERATION_MATMUL_IRREGULARLY_STRIDED_BATCH_COUNT CUDNN_DEPRECATED_ENUM = 1524,
612
+ CUDNN_ATTR_OPERATION_MATMUL_GEMM_M_OVERRIDE_DESC = 1525,
613
+ CUDNN_ATTR_OPERATION_MATMUL_GEMM_N_OVERRIDE_DESC = 1526,
614
+ CUDNN_ATTR_OPERATION_MATMUL_GEMM_K_OVERRIDE_DESC = 1527,
615
+
616
+ CUDNN_ATTR_REDUCTION_OPERATOR = 1600,
617
+ CUDNN_ATTR_REDUCTION_COMP_TYPE = 1601,
618
+
619
+ CUDNN_ATTR_OPERATION_REDUCTION_XDESC = 1610,
620
+ CUDNN_ATTR_OPERATION_REDUCTION_YDESC = 1611,
621
+ CUDNN_ATTR_OPERATION_REDUCTION_DESC = 1612,
622
+
623
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MATH_PREC = 1620,
624
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MEAN_DESC = 1621,
625
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_INVSTD_DESC = 1622,
626
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_BN_SCALE_DESC = 1623,
627
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_X_DESC = 1624,
628
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DY_DESC = 1625,
629
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_SCALE_DESC = 1626,
630
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_BIAS_DESC = 1627,
631
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_DY_SCALE_DESC = 1628,
632
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_X_SCALE_DESC = 1629,
633
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_BIAS = 1630,
634
+
635
+ CUDNN_ATTR_RESAMPLE_MODE = 1700,
636
+ CUDNN_ATTR_RESAMPLE_COMP_TYPE = 1701,
637
+ CUDNN_ATTR_RESAMPLE_SPATIAL_DIMS = 1702,
638
+ CUDNN_ATTR_RESAMPLE_POST_PADDINGS = 1703,
639
+ CUDNN_ATTR_RESAMPLE_PRE_PADDINGS = 1704,
640
+ CUDNN_ATTR_RESAMPLE_STRIDES = 1705,
641
+ CUDNN_ATTR_RESAMPLE_WINDOW_DIMS = 1706,
642
+ CUDNN_ATTR_RESAMPLE_NAN_PROPAGATION = 1707,
643
+ CUDNN_ATTR_RESAMPLE_PADDING_MODE = 1708,
644
+
645
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_XDESC = 1710,
646
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_YDESC = 1711,
647
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_IDXDESC = 1712,
648
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_ALPHA CUDNN_DEPRECATED_ENUM = 1713,
649
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_BETA CUDNN_DEPRECATED_ENUM = 1714,
650
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_DESC = 1716,
651
+
652
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DXDESC = 1720,
653
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DYDESC = 1721,
654
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_IDXDESC = 1722,
655
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_ALPHA CUDNN_DEPRECATED_ENUM = 1723,
656
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_BETA CUDNN_DEPRECATED_ENUM = 1724,
657
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DESC = 1725,
658
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_XDESC = 1726,
659
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_YDESC = 1727,
660
+
661
+ CUDNN_ATTR_OPERATION_CONCAT_AXIS = 1800,
662
+ CUDNN_ATTR_OPERATION_CONCAT_INPUT_DESCS = 1801,
663
+ CUDNN_ATTR_OPERATION_CONCAT_INPLACE_INDEX = 1802,
664
+ CUDNN_ATTR_OPERATION_CONCAT_OUTPUT_DESC = 1803,
665
+
666
+ CUDNN_ATTR_OPERATION_SIGNAL_MODE = 1900,
667
+ CUDNN_ATTR_OPERATION_SIGNAL_FLAGDESC = 1901,
668
+ CUDNN_ATTR_OPERATION_SIGNAL_VALUE = 1902,
669
+ CUDNN_ATTR_OPERATION_SIGNAL_XDESC = 1903,
670
+ CUDNN_ATTR_OPERATION_SIGNAL_YDESC = 1904,
671
+
672
+ CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_CONTAINER_DESC = 1950,
673
+ CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_YDESC = 1951,
674
+ CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_SEQUENCE_DESC = 1952,
675
+ CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_PAGE_TABLE_DESC = 1953,
676
+
677
+ CUDNN_ATTR_OPERATION_NORM_FWD_MODE = 2000,
678
+ CUDNN_ATTR_OPERATION_NORM_FWD_PHASE = 2001,
679
+ CUDNN_ATTR_OPERATION_NORM_FWD_XDESC = 2002,
680
+ CUDNN_ATTR_OPERATION_NORM_FWD_MEAN_DESC = 2003,
681
+ CUDNN_ATTR_OPERATION_NORM_FWD_INV_VARIANCE_DESC = 2004,
682
+ CUDNN_ATTR_OPERATION_NORM_FWD_SCALE_DESC = 2005,
683
+ CUDNN_ATTR_OPERATION_NORM_FWD_BIAS_DESC = 2006,
684
+ CUDNN_ATTR_OPERATION_NORM_FWD_EPSILON_DESC = 2007,
685
+ CUDNN_ATTR_OPERATION_NORM_FWD_EXP_AVG_FACTOR_DESC = 2008,
686
+ CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_MEAN_DESC = 2009,
687
+ CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_VAR_DESC = 2010,
688
+ CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_MEAN_DESC = 2011,
689
+ CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_VAR_DESC = 2012,
690
+ CUDNN_ATTR_OPERATION_NORM_FWD_YDESC = 2013,
691
+ CUDNN_ATTR_OPERATION_NORM_FWD_PEER_STAT_DESCS = 2014,
692
+
693
+ CUDNN_ATTR_OPERATION_NORM_BWD_MODE = 2100,
694
+ CUDNN_ATTR_OPERATION_NORM_BWD_XDESC = 2101,
695
+ CUDNN_ATTR_OPERATION_NORM_BWD_MEAN_DESC = 2102,
696
+ CUDNN_ATTR_OPERATION_NORM_BWD_INV_VARIANCE_DESC = 2103,
697
+ CUDNN_ATTR_OPERATION_NORM_BWD_DYDESC = 2104,
698
+ CUDNN_ATTR_OPERATION_NORM_BWD_SCALE_DESC = 2105,
699
+ CUDNN_ATTR_OPERATION_NORM_BWD_EPSILON_DESC = 2106,
700
+ CUDNN_ATTR_OPERATION_NORM_BWD_DSCALE_DESC = 2107,
701
+ CUDNN_ATTR_OPERATION_NORM_BWD_DBIAS_DESC = 2108,
702
+ CUDNN_ATTR_OPERATION_NORM_BWD_DXDESC = 2109,
703
+ CUDNN_ATTR_OPERATION_NORM_BWD_PEER_STAT_DESCS = 2110,
704
+
705
+ CUDNN_ATTR_OPERATION_RESHAPE_XDESC = 2200,
706
+ CUDNN_ATTR_OPERATION_RESHAPE_YDESC = 2201,
707
+
708
+ CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_XDESC = 2250,
709
+ CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_YDESC = 2251,
710
+ CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_LOWER_BANDWIDTH = 2252,
711
+ CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_UPPER_BANDWIDTH = 2253,
712
+ CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_AXIS = 2254,
713
+ CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_PAD_VALUE = 2255,
714
+ CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_KV_TOKEN_OFFSET_DESC = 2256,
715
+
716
+ CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_XDESC = 2270,
717
+ CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_YDESC = 2271,
718
+ CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_LOWER_BANDWIDTH = 2272,
719
+ CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_UPPER_BANDWIDTH = 2273,
720
+ CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_AXIS = 2274,
721
+ CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_PAD_VALUE = 2275,
722
+ CUDNN_ATTR_OPERATION_CONTRACT_BAND_MAX_TOKEN_VALUE = 2276,
723
+
724
+ CUDNN_ATTR_RNG_DISTRIBUTION = 2300,
725
+ CUDNN_ATTR_RNG_NORMAL_DIST_MEAN = 2301,
726
+ CUDNN_ATTR_RNG_NORMAL_DIST_STANDARD_DEVIATION = 2302,
727
+ CUDNN_ATTR_RNG_UNIFORM_DIST_MAXIMUM = 2303,
728
+ CUDNN_ATTR_RNG_UNIFORM_DIST_MINIMUM = 2304,
729
+ CUDNN_ATTR_RNG_BERNOULLI_DIST_PROBABILITY = 2305,
730
+
731
+ CUDNN_ATTR_OPERATION_RNG_YDESC = 2310,
732
+ CUDNN_ATTR_OPERATION_RNG_SEED = 2311,
733
+ CUDNN_ATTR_OPERATION_RNG_DESC = 2312,
734
+ CUDNN_ATTR_OPERATION_RNG_OFFSET_DESC = 2313,
735
+
736
+ CUDNN_ATTR_KERNEL_CACHE_OPERATION_GRAPH = 2400,
737
+ CUDNN_ATTR_KERNEL_CACHE_IS_ENGINECFG_KERNEL_CACHED = 2401,
738
+ CUDNN_ATTR_KERNEL_CACHE_JSON_REPRESENTATION = 2402,
739
+
740
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_XDESC = 2500,
741
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_YDESC = 2501,
742
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_SCALE_DESC = 2502,
743
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_MATH_PREC = 2503,
744
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_BLOCK_SIZE = 2504,
745
+
746
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_XDESC = 2600,
747
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_SCALE_DESC = 2601,
748
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_YDESC = 2602,
749
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_MATH_PREC = 2603,
750
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_BLOCK_SIZE = 2604,
751
+
752
+ CUDNN_ATTR_DEVICEPROP_DEVICE_ID = 2700,
753
+ CUDNN_ATTR_DEVICEPROP_HANDLE = 2701,
754
+ CUDNN_ATTR_DEVICEPROP_JSON_REPRESENTATION = 2702,
755
+ } cudnnBackendAttributeName_t;
756
+
757
+ typedef enum {
758
+ CUDNN_TYPE_HANDLE = 0,
759
+ CUDNN_TYPE_DATA_TYPE = 1,
760
+ CUDNN_TYPE_BOOLEAN = 2,
761
+ CUDNN_TYPE_INT64 = 3,
762
+ CUDNN_TYPE_FLOAT = 4,
763
+ CUDNN_TYPE_DOUBLE = 5,
764
+ CUDNN_TYPE_VOID_PTR = 6,
765
+ CUDNN_TYPE_CONVOLUTION_MODE = 7,
766
+ CUDNN_TYPE_HEUR_MODE = 8,
767
+ CUDNN_TYPE_KNOB_TYPE = 9,
768
+ CUDNN_TYPE_NAN_PROPOGATION CUDNN_DEPRECATED_ENUM = 10,
769
+ CUDNN_TYPE_NUMERICAL_NOTE = 11,
770
+ CUDNN_TYPE_LAYOUT_TYPE = 12,
771
+ CUDNN_TYPE_ATTRIB_NAME = 13,
772
+ CUDNN_TYPE_POINTWISE_MODE = 14,
773
+ CUDNN_TYPE_BACKEND_DESCRIPTOR = 15,
774
+ CUDNN_TYPE_GENSTATS_MODE = 16,
775
+ CUDNN_TYPE_BN_FINALIZE_STATS_MODE = 17,
776
+ CUDNN_TYPE_REDUCTION_OPERATOR_TYPE = 18,
777
+ CUDNN_TYPE_BEHAVIOR_NOTE = 19,
778
+ CUDNN_TYPE_TENSOR_REORDERING_MODE = 20,
779
+ CUDNN_TYPE_RESAMPLE_MODE = 21,
780
+ CUDNN_TYPE_PADDING_MODE = 22,
781
+ CUDNN_TYPE_INT32 = 23,
782
+ CUDNN_TYPE_CHAR = 24,
783
+ CUDNN_TYPE_SIGNAL_MODE = 25,
784
+ CUDNN_TYPE_FRACTION = 26,
785
+ CUDNN_TYPE_NORM_MODE = 27,
786
+ CUDNN_TYPE_NORM_FWD_PHASE = 28,
787
+ CUDNN_TYPE_RNG_DISTRIBUTION = 29,
788
+ } cudnnBackendAttributeType_t;
789
+
790
+ typedef enum {
791
+ CUDNN_BACKEND_POINTWISE_DESCRIPTOR = 0,
792
+ CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR = 1,
793
+ CUDNN_BACKEND_ENGINE_DESCRIPTOR = 2,
794
+ CUDNN_BACKEND_ENGINECFG_DESCRIPTOR = 3,
795
+ CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR = 4,
796
+ CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR = 5,
797
+ CUDNN_BACKEND_INTERMEDIATE_INFO_DESCRIPTOR = 6,
798
+ CUDNN_BACKEND_KNOB_CHOICE_DESCRIPTOR = 7,
799
+ CUDNN_BACKEND_KNOB_INFO_DESCRIPTOR = 8,
800
+ CUDNN_BACKEND_LAYOUT_INFO_DESCRIPTOR = 9,
801
+ CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR = 10,
802
+ CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR = 11,
803
+ CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR = 12,
804
+ CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR = 13,
805
+ CUDNN_BACKEND_OPERATION_GEN_STATS_DESCRIPTOR = 14,
806
+ CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR = 15,
807
+ CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR = 16,
808
+ CUDNN_BACKEND_TENSOR_DESCRIPTOR = 17,
809
+ CUDNN_BACKEND_MATMUL_DESCRIPTOR = 18,
810
+ CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR = 19,
811
+ CUDNN_BACKEND_OPERATION_BN_FINALIZE_STATISTICS_DESCRIPTOR = 20,
812
+ CUDNN_BACKEND_REDUCTION_DESCRIPTOR = 21,
813
+ CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR = 22,
814
+ CUDNN_BACKEND_OPERATION_BN_BWD_WEIGHTS_DESCRIPTOR = 23,
815
+ CUDNN_BACKEND_RESAMPLE_DESCRIPTOR = 24,
816
+ CUDNN_BACKEND_OPERATION_RESAMPLE_FWD_DESCRIPTOR = 25,
817
+ CUDNN_BACKEND_OPERATION_RESAMPLE_BWD_DESCRIPTOR = 26,
818
+ CUDNN_BACKEND_OPERATION_CONCAT_DESCRIPTOR = 27,
819
+ CUDNN_BACKEND_OPERATION_SIGNAL_DESCRIPTOR = 28,
820
+ CUDNN_BACKEND_OPERATION_NORM_FORWARD_DESCRIPTOR = 29,
821
+ CUDNN_BACKEND_OPERATION_NORM_BACKWARD_DESCRIPTOR = 30,
822
+ CUDNN_BACKEND_OPERATION_RESHAPE_DESCRIPTOR = 31,
823
+ CUDNN_BACKEND_RNG_DESCRIPTOR = 32,
824
+ CUDNN_BACKEND_OPERATION_RNG_DESCRIPTOR = 33,
825
+ CUDNN_BACKEND_KERNEL_CACHE_DESCRIPTOR = 34,
826
+ CUDNN_BACKEND_OPERATION_PAGED_CACHE_LOAD_DESCRIPTOR = 35,
827
+ CUDNN_BACKEND_OPERATION_BLOCK_SCALE_QUANTIZE_DESCRIPTOR = 36,
828
+ CUDNN_BACKEND_OPERATION_BLOCK_SCALE_DEQUANTIZE_DESCRIPTOR = 37,
829
+ CUDNN_BACKEND_DEVICEPROP_DESCRIPTOR = 38,
830
+ CUDNN_BACKEND_OPERATION_EXPAND_BAND_MATRIX_DESCRIPTOR = 39,
831
+ CUDNN_BACKEND_OPERATION_CONTRACT_BAND_MATRIX_DESCRIPTOR = 40,
832
+ } cudnnBackendDescriptorType_t;
833
+
834
+ typedef enum {
835
+ CUDNN_NUMERICAL_NOTE_TENSOR_CORE = 0,
836
+ CUDNN_NUMERICAL_NOTE_DOWN_CONVERT_INPUTS = 1,
837
+ CUDNN_NUMERICAL_NOTE_REDUCED_PRECISION_REDUCTION = 2,
838
+ CUDNN_NUMERICAL_NOTE_FFT = 3,
839
+ CUDNN_NUMERICAL_NOTE_NONDETERMINISTIC = 4,
840
+ CUDNN_NUMERICAL_NOTE_WINOGRAD = 5,
841
+ CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_4x4 = 6,
842
+ CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_6x6 = 7,
843
+ CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_13x13 = 8,
844
+ CUDNN_NUMERICAL_NOTE_STRICT_NAN_PROP = 9,
845
+ CUDNN_NUMERICAL_NOTE_TYPE_COUNT = 10,
846
+ } cudnnBackendNumericalNote_t;
847
+
848
+ typedef enum {
849
+ CUDNN_BEHAVIOR_NOTE_RUNTIME_COMPILATION = 0,
850
+ CUDNN_BEHAVIOR_NOTE_REQUIRES_FILTER_INT8x32_REORDER = 1,
851
+ CUDNN_BEHAVIOR_NOTE_REQUIRES_BIAS_INT8x32_REORDER = 2,
852
+ CUDNN_BEHAVIOR_NOTE_SUPPORTS_CUDA_GRAPH_NATIVE_API = 3,
853
+ CUDNN_BEHAVIOR_NOTE_TYPE_COUNT = 4,
854
+ } cudnnBackendBehaviorNote_t;
855
+
856
+ typedef enum {
857
+ CUDNN_KNOB_TYPE_SPLIT_K CUDNN_DEPRECATED_ENUM = 0,
858
+ CUDNN_KNOB_TYPE_SWIZZLE = 1,
859
+ CUDNN_KNOB_TYPE_TILE_SIZE = 2,
860
+ CUDNN_KNOB_TYPE_USE_TEX CUDNN_DEPRECATED_ENUM = 3,
861
+ CUDNN_KNOB_TYPE_EDGE = 4,
862
+ CUDNN_KNOB_TYPE_KBLOCK CUDNN_DEPRECATED_ENUM = 5,
863
+ CUDNN_KNOB_TYPE_LDGA CUDNN_DEPRECATED_ENUM = 6,
864
+ CUDNN_KNOB_TYPE_LDGB CUDNN_DEPRECATED_ENUM = 7,
865
+ CUDNN_KNOB_TYPE_CHUNK_K CUDNN_DEPRECATED_ENUM = 8,
866
+ CUDNN_KNOB_TYPE_SPLIT_H CUDNN_DEPRECATED_ENUM = 9,
867
+ CUDNN_KNOB_TYPE_WINO_TILE CUDNN_DEPRECATED_ENUM = 10,
868
+ CUDNN_KNOB_TYPE_MULTIPLY = 11,
869
+ CUDNN_KNOB_TYPE_SPLIT_K_BUF = 12,
870
+ CUDNN_KNOB_TYPE_TILEK = 13,
871
+ CUDNN_KNOB_TYPE_STAGES = 14,
872
+ CUDNN_KNOB_TYPE_REDUCTION_MODE = 15,
873
+ CUDNN_KNOB_TYPE_CTA_SPLIT_K_MODE CUDNN_DEPRECATED_ENUM = 16,
874
+ CUDNN_KNOB_TYPE_SPLIT_K_SLC = 17,
875
+ CUDNN_KNOB_TYPE_IDX_MODE = 18,
876
+ CUDNN_KNOB_TYPE_SLICED CUDNN_DEPRECATED_ENUM = 19,
877
+ CUDNN_KNOB_TYPE_SPLIT_RS CUDNN_DEPRECATED_ENUM = 20,
878
+ CUDNN_KNOB_TYPE_SINGLEBUFFER CUDNN_DEPRECATED_ENUM = 21,
879
+ CUDNN_KNOB_TYPE_LDGC CUDNN_DEPRECATED_ENUM = 22,
880
+ CUDNN_KNOB_TYPE_SPECFILT = 23,
881
+ CUDNN_KNOB_TYPE_KERNEL_CFG = 24,
882
+ CUDNN_KNOB_TYPE_WORKSPACE = 25,
883
+ CUDNN_KNOB_TYPE_TILE_CGA CUDNN_DEPRECATED_ENUM = 26,
884
+ CUDNN_KNOB_TYPE_TILE_CGA_M = 27,
885
+ CUDNN_KNOB_TYPE_TILE_CGA_N = 28,
886
+ CUDNN_KNOB_TYPE_BLOCK_SIZE = 29,
887
+ CUDNN_KNOB_TYPE_OCCUPANCY = 30,
888
+ CUDNN_KNOB_TYPE_ARRAY_SIZE_PER_THREAD = 31,
889
+ CUDNN_KNOB_TYPE_NUM_C_PER_BLOCK CUDNN_DEPRECATED_ENUM = 32,
890
+ CUDNN_KNOB_TYPE_SPLIT_COLS = 33,
891
+ CUDNN_KNOB_TYPE_TILE_ROWS = 34,
892
+ CUDNN_KNOB_TYPE_TILE_COLS = 35,
893
+ CUDNN_KNOB_TYPE_LOAD_SIZE = 36,
894
+ CUDNN_KNOB_TYPE_CTA_COUNT = 37,
895
+ CUDNN_KNOB_TYPE_STREAM_K = 38,
896
+ CUDNN_KNOB_TYPE_SPLIT_P_SLC = 39,
897
+ CUDNN_KNOB_TYPE_TILE_M = 40,
898
+ CUDNN_KNOB_TYPE_TILE_N = 41,
899
+ CUDNN_KNOB_TYPE_WARP_SPEC_CFG = 42,
900
+ CUDNN_KNOB_TYPE_COUNTS = 43,
901
+ } cudnnBackendKnobType_t;
902
+
903
+ typedef enum {
904
+ CUDNN_LAYOUT_TYPE_PREFERRED_NCHW = 0,
905
+ CUDNN_LAYOUT_TYPE_PREFERRED_NHWC = 1,
906
+ CUDNN_LAYOUT_TYPE_PREFERRED_PAD4CK = 2,
907
+ CUDNN_LAYOUT_TYPE_PREFERRED_PAD8CK = 3,
908
+ CUDNN_LAYOUT_TYPE_COUNT = 4,
909
+ } cudnnBackendLayoutType_t;
910
+
911
+ typedef enum {
912
+ CUDNN_HEUR_MODE_INSTANT = 0,
913
+ CUDNN_HEUR_MODE_B = 1,
914
+ CUDNN_HEUR_MODE_FALLBACK = 2,
915
+ CUDNN_HEUR_MODE_A = 3,
916
+ CUDNN_HEUR_MODES_COUNT = 4,
917
+ } cudnnBackendHeurMode_t;
918
+
919
+ typedef enum {
920
+ CUDNN_TENSOR_REORDERING_NONE = 0,
921
+ CUDNN_TENSOR_REORDERING_INT8x32 = 1,
922
+ CUDNN_TENSOR_REORDERING_F16x16 = 2,
923
+ CUDNN_TENSOR_REORDERING_F8_128x4 = 3,
924
+ } cudnnBackendTensorReordering_t;
925
+
926
+ typedef enum {
927
+ CUDNN_ZERO_PAD = 0,
928
+ CUDNN_NEG_INF_PAD = 1,
929
+ CUDNN_EDGE_VAL_PAD = 2,
930
+ } cudnnPaddingMode_t;
931
+
932
+ typedef enum {
933
+ CUDNN_LAYER_NORM = 0,
934
+ CUDNN_INSTANCE_NORM = 1,
935
+ CUDNN_BATCH_NORM = 2,
936
+ CUDNN_GROUP_NORM = 3,
937
+ CUDNN_RMS_NORM = 4,
938
+ CUDNN_ADA_LAYER_NORM = 5,
939
+ } cudnnBackendNormMode_t;
940
+
941
+ typedef enum {
942
+ CUDNN_NORM_FWD_INFERENCE = 0,
943
+ CUDNN_NORM_FWD_TRAINING = 1,
944
+ } cudnnBackendNormFwdPhase_t;
945
+
946
+ cudnnStatus_t CUDNNWINAPI
947
+ cudnnBackendCreateDescriptor(cudnnBackendDescriptorType_t descriptorType, cudnnBackendDescriptor_t *descriptor);
948
+
949
+ cudnnStatus_t CUDNNWINAPI
950
+ cudnnBackendDestroyDescriptor(cudnnBackendDescriptor_t descriptor);
951
+
952
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
953
+ cudnnBackendInitialize(cudnnBackendDescriptor_t descriptor);
954
+
955
+ cudnnStatus_t CUDNNWINAPI
956
+ cudnnBackendFinalize(cudnnBackendDescriptor_t descriptor);
957
+
958
+ cudnnStatus_t CUDNNWINAPI
959
+ cudnnBackendSetAttribute(cudnnBackendDescriptor_t descriptor,
960
+ cudnnBackendAttributeName_t attributeName,
961
+ cudnnBackendAttributeType_t attributeType,
962
+ int64_t elementCount,
963
+ const void *arrayOfElements);
964
+
965
+ cudnnStatus_t CUDNNWINAPI
966
+ cudnnBackendGetAttribute(cudnnBackendDescriptor_t const descriptor,
967
+ cudnnBackendAttributeName_t attributeName,
968
+ cudnnBackendAttributeType_t attributeType,
969
+ int64_t requestedElementCount,
970
+ int64_t *elementCount,
971
+ void *arrayOfElements);
972
+
973
+ cudnnStatus_t CUDNNWINAPI
974
+ cudnnBackendExecute(cudnnHandle_t handle, cudnnBackendDescriptor_t executionPlan, cudnnBackendDescriptor_t variantPack);
975
+
976
+ cudnnStatus_t CUDNNWINAPI
977
+ cudnnBackendPopulateCudaGraph(cudnnHandle_t handle,
978
+ cudnnBackendDescriptor_t executionPlan,
979
+ cudnnBackendDescriptor_t variantPack,
980
+ cudaGraph_t graph);
981
+
982
+ cudnnStatus_t CUDNNWINAPI
983
+ cudnnBackendUpdateCudaGraph(cudnnHandle_t handle,
984
+ cudnnBackendDescriptor_t executionPlan,
985
+ cudnnBackendDescriptor_t variantPack,
986
+ cudaGraph_t graph);
987
+
988
+ #if defined(__cplusplus)
989
+ }
990
+ #endif
991
+
992
+ #endif /* CUDNN_GRAPH_H_ */
.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_graph_v9.h ADDED
@@ -0,0 +1,992 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2014-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /*
51
+ * cudnn_graph : cuDNN's basic definitions operations.
52
+ */
53
+
54
+ #if !defined(CUDNN_GRAPH_H_)
55
+ #define CUDNN_GRAPH_H_
56
+
57
+ #include <cuda_runtime_api.h>
58
+ #include <library_types.h>
59
+
60
+ #include <stdint.h>
61
+
62
+ #include "cudnn_version.h"
63
+
64
+ /* These version numbers are autogenerated, do not edit manually. */
65
+ #define CUDNN_GRAPH_MAJOR 9
66
+ #define CUDNN_GRAPH_MINOR 10
67
+ #define CUDNN_GRAPH_PATCH 2
68
+
69
+ #if (CUDNN_GRAPH_MAJOR != CUDNN_MAJOR) || (CUDNN_GRAPH_MINOR != CUDNN_MINOR) || (CUDNN_GRAPH_PATCH != CUDNN_PATCHLEVEL)
70
+ #error Version mismatch in cuDNN GRAPH!!!
71
+ #endif
72
+
73
+ #ifndef CUDNNWINAPI
74
+ #ifdef _WIN32
75
+ #define CUDNNWINAPI __stdcall
76
+ #else
77
+ #define CUDNNWINAPI
78
+ #endif
79
+ #endif
80
+
81
+ /* Warnings for deprecated API-s are enabled using the CUDNN_WARN_DEPRECATED macro */
82
+ #if defined(CUDNN_WARN_DEPRECATED) && (defined(__GNUC__) || defined(__clang__))
83
+ /* GCC, Intel C/C++, Cray C/C++, CLANG, IBM XL C/C++ little endian */
84
+ #define CUDNN_DEPRECATED __attribute__((deprecated))
85
+ #define CUDNN_DEPRECATED_ENUM __attribute__((deprecated))
86
+ #elif defined(CUDNN_WARN_DEPRECATED) && defined(_MSC_VER)
87
+ /* Microsoft Visual C++ */
88
+ #define CUDNN_DEPRECATED __declspec(deprecated)
89
+ #define CUDNN_DEPRECATED_ENUM __declspec(deprecated)
90
+ #elif defined(CUDNN_WARN_DEPRECATED) && (__cplusplus >= 201402L)
91
+ /* C++14 compilers */
92
+ #define CUDNN_DEPRECATED [[deprecated]]
93
+ #define CUDNN_DEPRECATED_ENUM [[deprecated]]
94
+ #else
95
+ /* No support for the deprecated attribute */
96
+ #define CUDNN_DEPRECATED
97
+ #define CUDNN_DEPRECATED_ENUM
98
+ #endif
99
+
100
+ #if defined(__cplusplus)
101
+ extern "C" {
102
+ #endif
103
+
104
+ struct cudnnContext;
105
+ typedef struct cudnnContext *cudnnHandle_t;
106
+
107
+ size_t CUDNNWINAPI
108
+ cudnnGetVersion(void);
109
+
110
+ size_t CUDNNWINAPI
111
+ cudnnGetMaxDeviceVersion(void);
112
+
113
+ /* Returns CUDA Runtime version statically linked against cudnn */
114
+ size_t CUDNNWINAPI
115
+ cudnnGetCudartVersion(void);
116
+
117
+ /*
118
+ * CUDNN return codes
119
+ */
120
+ typedef enum {
121
+ CUDNN_STATUS_SUCCESS = 0,
122
+
123
+ /* Uncategorized errors */
124
+ CUDNN_STATUS_NOT_INITIALIZED = 1001,
125
+ CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH = 1002,
126
+ CUDNN_STATUS_SERIALIZATION_VERSION_MISMATCH = 1003,
127
+ CUDNN_STATUS_DEPRECATED = 1004,
128
+ CUDNN_STATUS_LICENSE_ERROR = 1005,
129
+ CUDNN_STATUS_RUNTIME_IN_PROGRESS = 1006,
130
+ CUDNN_STATUS_RUNTIME_FP_OVERFLOW = 1007,
131
+ CUDNN_STATUS_SUBLIBRARY_LOADING_FAILED = 1008,
132
+
133
+ CUDNN_STATUS_BAD_PARAM = 2000,
134
+ CUDNN_STATUS_BAD_PARAM_NULL_POINTER = 2002,
135
+ CUDNN_STATUS_BAD_PARAM_MISALIGNED_POINTER = 2003,
136
+ CUDNN_STATUS_BAD_PARAM_NOT_FINALIZED = 2004,
137
+ CUDNN_STATUS_BAD_PARAM_OUT_OF_BOUND = 2005,
138
+ CUDNN_STATUS_BAD_PARAM_SIZE_INSUFFICIENT = 2006,
139
+ CUDNN_STATUS_BAD_PARAM_STREAM_MISMATCH = 2007,
140
+ CUDNN_STATUS_BAD_PARAM_SHAPE_MISMATCH = 2008,
141
+ CUDNN_STATUS_BAD_PARAM_DUPLICATED_ENTRIES = 2009,
142
+ CUDNN_STATUS_BAD_PARAM_ATTRIBUTE_TYPE = 2010,
143
+ CUDNN_STATUS_BAD_PARAM_CUDA_GRAPH_MISMATCH = 2011,
144
+ CUDNN_STATUS_BAD_PARAM_DESCRIPTOR_TYPE = 2012,
145
+
146
+ CUDNN_STATUS_NOT_SUPPORTED = 3000,
147
+ CUDNN_STATUS_NOT_SUPPORTED_GRAPH_PATTERN = 3001,
148
+ CUDNN_STATUS_NOT_SUPPORTED_SHAPE = 3002,
149
+ CUDNN_STATUS_NOT_SUPPORTED_DATA_TYPE = 3003,
150
+ CUDNN_STATUS_NOT_SUPPORTED_LAYOUT = 3004,
151
+ CUDNN_STATUS_NOT_SUPPORTED_INCOMPATIBLE_CUDA_DRIVER = 3005,
152
+ CUDNN_STATUS_NOT_SUPPORTED_INCOMPATIBLE_CUDART = 3006,
153
+ CUDNN_STATUS_NOT_SUPPORTED_ARCH_MISMATCH = 3007,
154
+ CUDNN_STATUS_NOT_SUPPORTED_RUNTIME_PREREQUISITE_MISSING = 3008,
155
+ CUDNN_STATUS_NOT_SUPPORTED_SUBLIBRARY_UNAVAILABLE = 3009,
156
+ CUDNN_STATUS_NOT_SUPPORTED_SHARED_MEMORY_INSUFFICIENT = 3010,
157
+ CUDNN_STATUS_NOT_SUPPORTED_PADDING = 3011,
158
+ CUDNN_STATUS_NOT_SUPPORTED_BAD_LAUNCH_PARAM = 3012,
159
+ CUDNN_STATUS_NOT_SUPPORTED_CUDA_GRAPH_NATIVE_API = 3013,
160
+
161
+ CUDNN_STATUS_INTERNAL_ERROR = 4000,
162
+ CUDNN_STATUS_INTERNAL_ERROR_COMPILATION_FAILED = 4001,
163
+ CUDNN_STATUS_INTERNAL_ERROR_UNEXPECTED_VALUE = 4002,
164
+ CUDNN_STATUS_INTERNAL_ERROR_HOST_ALLOCATION_FAILED = 4003,
165
+ CUDNN_STATUS_INTERNAL_ERROR_DEVICE_ALLOCATION_FAILED = 4004,
166
+ CUDNN_STATUS_INTERNAL_ERROR_BAD_LAUNCH_PARAM = 4005,
167
+ CUDNN_STATUS_INTERNAL_ERROR_TEXTURE_CREATION_FAILED = 4006,
168
+
169
+ CUDNN_STATUS_EXECUTION_FAILED = 5000,
170
+ CUDNN_STATUS_EXECUTION_FAILED_CUDA_DRIVER = 5001,
171
+ CUDNN_STATUS_EXECUTION_FAILED_CUBLAS = 5002,
172
+ CUDNN_STATUS_EXECUTION_FAILED_CUDART = 5003,
173
+ CUDNN_STATUS_EXECUTION_FAILED_CURAND = 5004,
174
+
175
+ CUDNN_STATUS_ALLOC_FAILED CUDNN_DEPRECATED_ENUM = CUDNN_STATUS_INTERNAL_ERROR_HOST_ALLOCATION_FAILED,
176
+ CUDNN_STATUS_INVALID_VALUE CUDNN_DEPRECATED_ENUM = 2001 /* please transition to CUDNN_STATUS_BAD_PARAM instead */,
177
+ CUDNN_STATUS_ARCH_MISMATCH CUDNN_DEPRECATED_ENUM = CUDNN_STATUS_NOT_SUPPORTED_ARCH_MISMATCH,
178
+ CUDNN_STATUS_MAPPING_ERROR CUDNN_DEPRECATED_ENUM = CUDNN_STATUS_INTERNAL_ERROR_TEXTURE_CREATION_FAILED,
179
+ CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING CUDNN_DEPRECATED_ENUM =
180
+ CUDNN_STATUS_NOT_SUPPORTED_RUNTIME_PREREQUISITE_MISSING,
181
+ CUDNN_STATUS_VERSION_MISMATCH CUDNN_DEPRECATED_ENUM = CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH,
182
+ } cudnnStatus_t;
183
+
184
+ #define CUDNN_STATUS_FULL_ERROR_CODE(category, specific_err) ((cudnnStatus_t)(0 + (category) + (specific_err)))
185
+ #define CUDNN_STATUS_CATEGORY(full_error_code) ((full_error_code) / 1000 * 1000)
186
+ #define CUDNN_STATUS_SPECIFIC_ERROR(full_error_code) ((full_error_code) % 1000)
187
+
188
+ /* human-readable error messages */
189
+ const char *CUDNNWINAPI
190
+ cudnnGetErrorString(cudnnStatus_t status);
191
+
192
+ void CUDNNWINAPI
193
+ cudnnGetLastErrorString(char *message, size_t max_size);
194
+
195
+ /* Forward definition in this version only */
196
+ typedef struct cudnnRuntimeTag_t cudnnRuntimeTag_t CUDNN_DEPRECATED;
197
+
198
+ typedef enum {
199
+ CUDNN_ERRQUERY_RAWCODE = 0,
200
+ CUDNN_ERRQUERY_NONBLOCKING = 1,
201
+ CUDNN_ERRQUERY_BLOCKING = 2,
202
+ } cudnnErrQueryMode_t;
203
+
204
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
205
+ cudnnQueryRuntimeError(cudnnHandle_t handle, cudnnStatus_t *rstatus, cudnnErrQueryMode_t mode, cudnnRuntimeTag_t *tag);
206
+
207
+ cudnnStatus_t CUDNNWINAPI
208
+ cudnnGetProperty(libraryPropertyType type, int *value);
209
+
210
+ cudnnStatus_t CUDNNWINAPI
211
+ cudnnCreate(cudnnHandle_t *handle);
212
+ cudnnStatus_t CUDNNWINAPI
213
+ cudnnDestroy(cudnnHandle_t handle);
214
+ cudnnStatus_t CUDNNWINAPI
215
+ cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId);
216
+ cudnnStatus_t CUDNNWINAPI
217
+ cudnnGetStream(cudnnHandle_t handle, cudaStream_t *streamId);
218
+ /*
219
+ * CUDNN data type
220
+ */
221
+ typedef enum {
222
+ CUDNN_DATA_FLOAT = 0,
223
+ CUDNN_DATA_DOUBLE = 1,
224
+ CUDNN_DATA_HALF = 2,
225
+ CUDNN_DATA_INT8 = 3,
226
+ CUDNN_DATA_INT32 = 4,
227
+ CUDNN_DATA_INT8x4 CUDNN_DEPRECATED_ENUM = 5,
228
+ CUDNN_DATA_UINT8 = 6,
229
+ CUDNN_DATA_UINT8x4 CUDNN_DEPRECATED_ENUM = 7,
230
+ CUDNN_DATA_INT8x32 CUDNN_DEPRECATED_ENUM = 8,
231
+ CUDNN_DATA_BFLOAT16 = 9,
232
+ CUDNN_DATA_INT64 = 10,
233
+ CUDNN_DATA_BOOLEAN = 11,
234
+ CUDNN_DATA_FP8_E4M3 = 12,
235
+ CUDNN_DATA_FP8_E5M2 = 13,
236
+ CUDNN_DATA_FAST_FLOAT_FOR_FP8 = 14,
237
+ CUDNN_DATA_FP8_E8M0 = 15,
238
+ CUDNN_DATA_FP4_E2M1 = 16,
239
+ } cudnnDataType_t;
240
+
241
+ /*
242
+ * CUDNN math type
243
+ */
244
+ typedef enum {
245
+ CUDNN_DEFAULT_MATH = 0,
246
+ CUDNN_TENSOR_OP_MATH = 1,
247
+ CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION = 2,
248
+ CUDNN_FMA_MATH = 3,
249
+ } cudnnMathType_t;
250
+
251
+ /*
252
+ * CUDNN propagate Nan
253
+ */
254
+ typedef enum {
255
+ CUDNN_NOT_PROPAGATE_NAN CUDNN_DEPRECATED_ENUM = 0,
256
+ CUDNN_PROPAGATE_NAN CUDNN_DEPRECATED_ENUM = 1,
257
+ } cudnnNanPropagation_t;
258
+
259
+ /*
260
+ * Behavior for OOB samples. OOB samples are samples where L+R > T is encountered during the gradient calculation. If
261
+ * gradMode is set to CUDNN_CTC_SKIP_OOB_GRADIENTS, then the CTC loss function does not write to the gradient buffer for
262
+ * that sample. Instead, the current values, even not finite, are retained. If gradMode is set to
263
+ * CUDNN_CTC_ZERO_OOB_GRADIENTS, then the gradient for that sample is set to zero. This guarantees a finite gradient.
264
+ */
265
+ typedef enum {
266
+ CUDNN_CTC_ZERO_OOB_GRADIENTS = 0,
267
+ CUDNN_CTC_SKIP_OOB_GRADIENTS = 1,
268
+ } cudnnCTCGradMode_t;
269
+
270
+ typedef enum {
271
+ CUDNN_TENSOR_NCHW = 0, /* row major (wStride = 1, hStride = w) */
272
+ CUDNN_TENSOR_NHWC = 1, /* feature maps interleaved ( cStride = 1 )*/
273
+ CUDNN_TENSOR_NCHW_VECT_C = 2, /* each image point is vector of element of C, vector length in data type */
274
+ } cudnnTensorFormat_t;
275
+
276
+ /*
277
+ * CUDNN ReduceTensor op type
278
+ */
279
+ typedef enum {
280
+ CUDNN_REDUCE_TENSOR_ADD = 0,
281
+ CUDNN_REDUCE_TENSOR_MUL = 1,
282
+ CUDNN_REDUCE_TENSOR_MIN = 2,
283
+ CUDNN_REDUCE_TENSOR_MAX = 3,
284
+ CUDNN_REDUCE_TENSOR_AMAX = 4,
285
+ CUDNN_REDUCE_TENSOR_AVG = 5,
286
+ CUDNN_REDUCE_TENSOR_NORM1 = 6,
287
+ CUDNN_REDUCE_TENSOR_NORM2 = 7,
288
+ CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS = 8,
289
+ } cudnnReduceTensorOp_t;
290
+
291
+ /*
292
+ * activation mode
293
+ */
294
+ typedef enum {
295
+ CUDNN_ACTIVATION_SIGMOID = 0,
296
+ CUDNN_ACTIVATION_RELU = 1,
297
+ CUDNN_ACTIVATION_TANH = 2,
298
+ CUDNN_ACTIVATION_CLIPPED_RELU = 3,
299
+ CUDNN_ACTIVATION_ELU = 4,
300
+ CUDNN_ACTIVATION_IDENTITY = 5,
301
+ CUDNN_ACTIVATION_SWISH = 6
302
+ } cudnnActivationMode_t CUDNN_DEPRECATED;
303
+
304
+ typedef enum {
305
+ CUDNN_SEV_FATAL = 0,
306
+ CUDNN_SEV_ERROR = 1,
307
+ CUDNN_SEV_WARNING = 2,
308
+ CUDNN_SEV_INFO = 3,
309
+ } cudnnSeverity_t;
310
+
311
+ /* Message masks to be used with cudnnSetCallback() */
312
+ #define CUDNN_SEV_ERROR_EN (1U << CUDNN_SEV_ERROR)
313
+ #define CUDNN_SEV_WARNING_EN (1U << CUDNN_SEV_WARNING)
314
+ #define CUDNN_SEV_INFO_EN (1U << CUDNN_SEV_INFO)
315
+
316
+ /* struct containing useful informaiton for each API call */
317
+ typedef struct cudnnDebugStruct {
318
+ unsigned cudnn_version;
319
+ cudnnStatus_t cudnnStatus;
320
+ unsigned time_sec; /* epoch time in seconds */
321
+ unsigned time_usec; /* microseconds part of epoch time */
322
+ unsigned time_delta; /* time since start in seconds */
323
+ cudnnHandle_t handle; /* cudnn handle */
324
+ cudaStream_t stream; /* cuda stream ID */
325
+ unsigned long long pid; /* process ID */
326
+ unsigned long long tid; /* thread ID */
327
+ int cudaDeviceId; /* CUDA device ID */
328
+ int reserved[15]; /* reserved for future use */
329
+ } cudnnDebug_t;
330
+
331
+ typedef void (*cudnnCallback_t)(cudnnSeverity_t sev, void *udata, const cudnnDebug_t *dbg, const char *msg);
332
+
333
+ cudnnStatus_t CUDNNWINAPI
334
+ cudnnSetCallback(unsigned mask, void *udata, cudnnCallback_t fptr);
335
+
336
+ cudnnStatus_t CUDNNWINAPI
337
+ cudnnGetCallback(unsigned *mask, void **udata, cudnnCallback_t *fptr);
338
+
339
+ /*
340
+ * \brief Cross-library version checker.
341
+ * This function is implemented differently in each sub-library. Each sublib
342
+ * checks whether its own version matches that of its dependencies.
343
+ * \returns CUDNN_STATUS_SUCCESS if the version check passes,
344
+ * CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH if the versions are inconsistent.
345
+ */
346
+ cudnnStatus_t CUDNNWINAPI
347
+ cudnnGraphVersionCheck(void);
348
+
349
+ /* Maximum supported number of tensor dimensions */
350
+ #define CUDNN_DIM_MAX 8
351
+
352
+ /*
353
+ * convolution mode
354
+ */
355
+ typedef enum { CUDNN_CONVOLUTION = 0, CUDNN_CROSS_CORRELATION = 1 } cudnnConvolutionMode_t;
356
+
357
+ /*
358
+ * CUDNN Reorder
359
+ */
360
+ typedef enum {
361
+ CUDNN_DEFAULT_REORDER = 0,
362
+ CUDNN_NO_REORDER = 1,
363
+ } cudnnReorderType_t CUDNN_DEPRECATED;
364
+
365
+ typedef void *cudnnBackendDescriptor_t;
366
+
367
+ typedef struct cudnnFractionStruct {
368
+ int64_t numerator;
369
+ int64_t denominator;
370
+ } cudnnFraction_t;
371
+
372
+ typedef enum {
373
+ CUDNN_POINTWISE_ADD = 0,
374
+ CUDNN_POINTWISE_ADD_SQUARE = 5,
375
+ CUDNN_POINTWISE_DIV = 6,
376
+ CUDNN_POINTWISE_MAX = 3,
377
+ CUDNN_POINTWISE_MIN = 2,
378
+ CUDNN_POINTWISE_MOD = 7,
379
+ CUDNN_POINTWISE_MUL = 1,
380
+ CUDNN_POINTWISE_POW = 8,
381
+ CUDNN_POINTWISE_SUB = 9,
382
+
383
+ CUDNN_POINTWISE_ABS = 10,
384
+ CUDNN_POINTWISE_CEIL = 11,
385
+ CUDNN_POINTWISE_COS = 12,
386
+ CUDNN_POINTWISE_EXP = 13,
387
+ CUDNN_POINTWISE_FLOOR = 14,
388
+ CUDNN_POINTWISE_LOG = 15,
389
+ CUDNN_POINTWISE_NEG = 16,
390
+ CUDNN_POINTWISE_RSQRT = 17,
391
+ CUDNN_POINTWISE_SIN = 18,
392
+ CUDNN_POINTWISE_SQRT = 4,
393
+ CUDNN_POINTWISE_TAN = 19,
394
+ CUDNN_POINTWISE_ERF = 20,
395
+ CUDNN_POINTWISE_IDENTITY = 21,
396
+ CUDNN_POINTWISE_RECIPROCAL = 22,
397
+ CUDNN_POINTWISE_ATAN2 = 23,
398
+
399
+ CUDNN_POINTWISE_RELU_FWD = 100,
400
+ CUDNN_POINTWISE_TANH_FWD = 101,
401
+ CUDNN_POINTWISE_SIGMOID_FWD = 102,
402
+ CUDNN_POINTWISE_ELU_FWD = 103,
403
+ CUDNN_POINTWISE_GELU_FWD = 104,
404
+ CUDNN_POINTWISE_SOFTPLUS_FWD = 105,
405
+ CUDNN_POINTWISE_SWISH_FWD = 106,
406
+ CUDNN_POINTWISE_GELU_APPROX_TANH_FWD = 107,
407
+
408
+ CUDNN_POINTWISE_RELU_BWD = 200,
409
+ CUDNN_POINTWISE_TANH_BWD = 201,
410
+ CUDNN_POINTWISE_SIGMOID_BWD = 202,
411
+ CUDNN_POINTWISE_ELU_BWD = 203,
412
+ CUDNN_POINTWISE_GELU_BWD = 204,
413
+ CUDNN_POINTWISE_SOFTPLUS_BWD = 205,
414
+ CUDNN_POINTWISE_SWISH_BWD = 206,
415
+ CUDNN_POINTWISE_GELU_APPROX_TANH_BWD = 207,
416
+
417
+ CUDNN_POINTWISE_CMP_EQ = 300,
418
+ CUDNN_POINTWISE_CMP_NEQ = 301,
419
+ CUDNN_POINTWISE_CMP_GT = 302,
420
+ CUDNN_POINTWISE_CMP_GE = 303,
421
+ CUDNN_POINTWISE_CMP_LT = 304,
422
+ CUDNN_POINTWISE_CMP_LE = 305,
423
+
424
+ CUDNN_POINTWISE_LOGICAL_AND = 400,
425
+ CUDNN_POINTWISE_LOGICAL_OR = 401,
426
+ CUDNN_POINTWISE_LOGICAL_NOT = 402,
427
+
428
+ CUDNN_POINTWISE_GEN_INDEX = 501,
429
+
430
+ CUDNN_POINTWISE_BINARY_SELECT = 601,
431
+ } cudnnPointwiseMode_t;
432
+
433
+ typedef enum {
434
+ CUDNN_RESAMPLE_NEAREST = 0,
435
+ CUDNN_RESAMPLE_BILINEAR = 1,
436
+ CUDNN_RESAMPLE_AVGPOOL = 2,
437
+ CUDNN_RESAMPLE_AVGPOOL_INCLUDE_PADDING = 2,
438
+ CUDNN_RESAMPLE_AVGPOOL_EXCLUDE_PADDING = 4,
439
+ CUDNN_RESAMPLE_MAXPOOL = 3,
440
+ } cudnnResampleMode_t;
441
+
442
+ typedef enum {
443
+ CUDNN_SIGNAL_SET = 0,
444
+ CUDNN_SIGNAL_WAIT = 1,
445
+ } cudnnSignalMode_t;
446
+
447
+ typedef enum {
448
+ CUDNN_GENSTATS_SUM_SQSUM = 0,
449
+ } cudnnGenStatsMode_t;
450
+
451
+ typedef enum {
452
+ CUDNN_BN_FINALIZE_STATISTICS_TRAINING = 0,
453
+ CUDNN_BN_FINALIZE_STATISTICS_INFERENCE = 1,
454
+ } cudnnBnFinalizeStatsMode_t;
455
+
456
+ typedef enum {
457
+ CUDNN_RNG_DISTRIBUTION_BERNOULLI = 0,
458
+ CUDNN_RNG_DISTRIBUTION_UNIFORM = 1,
459
+ CUDNN_RNG_DISTRIBUTION_NORMAL = 2,
460
+ } cudnnRngDistribution_t;
461
+
462
+ typedef enum {
463
+ CUDNN_ATTR_POINTWISE_MODE = 0,
464
+ CUDNN_ATTR_POINTWISE_MATH_PREC = 1,
465
+ CUDNN_ATTR_POINTWISE_NAN_PROPAGATION CUDNN_DEPRECATED_ENUM = 2,
466
+ CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP = 3,
467
+ CUDNN_ATTR_POINTWISE_RELU_UPPER_CLIP = 4,
468
+ CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP_SLOPE = 5,
469
+ CUDNN_ATTR_POINTWISE_ELU_ALPHA = 6,
470
+ CUDNN_ATTR_POINTWISE_SOFTPLUS_BETA = 7,
471
+ CUDNN_ATTR_POINTWISE_SWISH_BETA = 8,
472
+ CUDNN_ATTR_POINTWISE_AXIS = 9,
473
+
474
+ CUDNN_ATTR_CONVOLUTION_COMP_TYPE = 100,
475
+ CUDNN_ATTR_CONVOLUTION_CONV_MODE = 101,
476
+ CUDNN_ATTR_CONVOLUTION_DILATIONS = 102,
477
+ CUDNN_ATTR_CONVOLUTION_FILTER_STRIDES = 103,
478
+ CUDNN_ATTR_CONVOLUTION_POST_PADDINGS = 104,
479
+ CUDNN_ATTR_CONVOLUTION_PRE_PADDINGS = 105,
480
+ CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS = 106,
481
+
482
+ CUDNN_ATTR_ENGINEHEUR_MODE = 200,
483
+ CUDNN_ATTR_ENGINEHEUR_OPERATION_GRAPH = 201,
484
+ CUDNN_ATTR_ENGINEHEUR_RESULTS = 202,
485
+ CUDNN_ATTR_ENGINEHEUR_SM_COUNT_TARGET = 203,
486
+ CUDNN_ATTR_ENGINEHEUR_DEVICEPROP = 204,
487
+
488
+ CUDNN_ATTR_ENGINECFG_ENGINE = 300,
489
+ CUDNN_ATTR_ENGINECFG_INTERMEDIATE_INFO = 301,
490
+ CUDNN_ATTR_ENGINECFG_KNOB_CHOICES = 302,
491
+ CUDNN_ATTR_ENGINECFG_WORKSPACE_SIZE = 303,
492
+ CUDNN_ATTR_ENGINECFG_SHARED_MEMORY_USED = 304,
493
+
494
+ CUDNN_ATTR_EXECUTION_PLAN_HANDLE CUDNN_DEPRECATED_ENUM = 400,
495
+ CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG = 401,
496
+ CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE = 402,
497
+ CUDNN_ATTR_EXECUTION_PLAN_COMPUTED_INTERMEDIATE_UIDS = 403,
498
+ CUDNN_ATTR_EXECUTION_PLAN_RUN_ONLY_INTERMEDIATE_UIDS = 404,
499
+ CUDNN_ATTR_EXECUTION_PLAN_JSON_REPRESENTATION = 405,
500
+ CUDNN_ATTR_EXECUTION_PLAN_KERNEL_CACHE = 406,
501
+ CUDNN_ATTR_EXECUTION_PLAN_DEVICEPROP = 407,
502
+
503
+ CUDNN_ATTR_INTERMEDIATE_INFO_UNIQUE_ID = 500,
504
+ CUDNN_ATTR_INTERMEDIATE_INFO_SIZE = 501,
505
+ CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_DATA_UIDS = 502,
506
+ CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_ATTRIBUTES = 503,
507
+
508
+ CUDNN_ATTR_KNOB_CHOICE_KNOB_TYPE = 600,
509
+ CUDNN_ATTR_KNOB_CHOICE_KNOB_VALUE = 601,
510
+
511
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_ALPHA = 700,
512
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_BETA = 701,
513
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_CONV_DESC = 702,
514
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_W = 703,
515
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_X = 704,
516
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_Y = 705,
517
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_ALPHA = 706,
518
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_BETA = 707,
519
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_CONV_DESC = 708,
520
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_W = 709,
521
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DX = 710,
522
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DY = 711,
523
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_ALPHA = 712,
524
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_BETA = 713,
525
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_CONV_DESC = 714,
526
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DW = 715,
527
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_X = 716,
528
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DY = 717,
529
+
530
+ CUDNN_ATTR_OPERATION_POINTWISE_PW_DESCRIPTOR = 750,
531
+ CUDNN_ATTR_OPERATION_POINTWISE_XDESC = 751,
532
+ CUDNN_ATTR_OPERATION_POINTWISE_BDESC = 752,
533
+ CUDNN_ATTR_OPERATION_POINTWISE_YDESC = 753,
534
+ CUDNN_ATTR_OPERATION_POINTWISE_ALPHA1 = 754,
535
+ CUDNN_ATTR_OPERATION_POINTWISE_ALPHA2 = 755,
536
+ CUDNN_ATTR_OPERATION_POINTWISE_DXDESC = 756,
537
+ CUDNN_ATTR_OPERATION_POINTWISE_DYDESC = 757,
538
+ CUDNN_ATTR_OPERATION_POINTWISE_TDESC = 758,
539
+
540
+ CUDNN_ATTR_OPERATION_GENSTATS_MODE = 770,
541
+ CUDNN_ATTR_OPERATION_GENSTATS_MATH_PREC = 771,
542
+ CUDNN_ATTR_OPERATION_GENSTATS_XDESC = 772,
543
+ CUDNN_ATTR_OPERATION_GENSTATS_SUMDESC = 773,
544
+ CUDNN_ATTR_OPERATION_GENSTATS_SQSUMDESC = 774,
545
+
546
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_STATS_MODE = 780,
547
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_MATH_PREC = 781,
548
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SUM_DESC = 782,
549
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SQ_SUM_DESC = 783,
550
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_SCALE_DESC = 784,
551
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_BIAS_DESC = 785,
552
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_MEAN_DESC = 786,
553
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_VAR_DESC = 787,
554
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_MEAN_DESC = 788,
555
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_VAR_DESC = 789,
556
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_MEAN_DESC = 790,
557
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_INV_STD_DESC = 791,
558
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_SCALE_DESC = 792,
559
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_BIAS_DESC = 793,
560
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_ACCUM_COUNT_DESC = 794,
561
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_EPSILON_DESC = 795,
562
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_EXP_AVERATE_FACTOR_DESC = 796,
563
+
564
+ CUDNN_ATTR_OPERATIONGRAPH_HANDLE CUDNN_DEPRECATED_ENUM = 800,
565
+ CUDNN_ATTR_OPERATIONGRAPH_OPS = 801,
566
+ CUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNT = 802,
567
+ CUDNN_ATTR_OPERATIONGRAPH_IS_DYNAMIC_SHAPE_ENABLED = 803,
568
+ CUDNN_ATTR_OPERATIONGRAPH_IS_SAME_TOPOLOGY = 804,
569
+
570
+ CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT = 900,
571
+ CUDNN_ATTR_TENSOR_DATA_TYPE = 901,
572
+ CUDNN_ATTR_TENSOR_DIMENSIONS = 902,
573
+ CUDNN_ATTR_TENSOR_STRIDES = 903,
574
+ CUDNN_ATTR_TENSOR_VECTOR_COUNT = 904,
575
+ CUDNN_ATTR_TENSOR_VECTORIZED_DIMENSION = 905,
576
+ CUDNN_ATTR_TENSOR_UNIQUE_ID = 906,
577
+ CUDNN_ATTR_TENSOR_IS_VIRTUAL = 907,
578
+ CUDNN_ATTR_TENSOR_IS_BY_VALUE = 908,
579
+ CUDNN_ATTR_TENSOR_REORDERING_MODE = 909,
580
+ CUDNN_ATTR_TENSOR_RAGGED_OFFSET_DESC = 913,
581
+
582
+ CUDNN_ATTR_VARIANT_PACK_UNIQUE_IDS = 1000,
583
+ CUDNN_ATTR_VARIANT_PACK_DATA_POINTERS = 1001,
584
+ CUDNN_ATTR_VARIANT_PACK_INTERMEDIATES = 1002,
585
+ CUDNN_ATTR_VARIANT_PACK_WORKSPACE = 1003,
586
+
587
+ CUDNN_ATTR_LAYOUT_INFO_TENSOR_UID = 1100,
588
+ CUDNN_ATTR_LAYOUT_INFO_TYPES = 1101,
589
+
590
+ CUDNN_ATTR_KNOB_INFO_TYPE = 1200,
591
+ CUDNN_ATTR_KNOB_INFO_MAXIMUM_VALUE = 1201,
592
+ CUDNN_ATTR_KNOB_INFO_MINIMUM_VALUE = 1202,
593
+ CUDNN_ATTR_KNOB_INFO_STRIDE = 1203,
594
+
595
+ CUDNN_ATTR_ENGINE_OPERATION_GRAPH = 1300,
596
+ CUDNN_ATTR_ENGINE_GLOBAL_INDEX = 1301,
597
+ CUDNN_ATTR_ENGINE_KNOB_INFO = 1302,
598
+ CUDNN_ATTR_ENGINE_NUMERICAL_NOTE = 1303,
599
+ CUDNN_ATTR_ENGINE_LAYOUT_INFO = 1304,
600
+ CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE = 1305,
601
+ CUDNN_ATTR_ENGINE_SM_COUNT_TARGET = 1306,
602
+ CUDNN_ATTR_ENGINE_DEVICEPROP = 1307,
603
+
604
+ CUDNN_ATTR_MATMUL_COMP_TYPE = 1500,
605
+ CUDNN_ATTR_MATMUL_PADDING_VALUE = 1503,
606
+
607
+ CUDNN_ATTR_OPERATION_MATMUL_ADESC = 1520,
608
+ CUDNN_ATTR_OPERATION_MATMUL_BDESC = 1521,
609
+ CUDNN_ATTR_OPERATION_MATMUL_CDESC = 1522,
610
+ CUDNN_ATTR_OPERATION_MATMUL_DESC = 1523,
611
+ CUDNN_ATTR_OPERATION_MATMUL_IRREGULARLY_STRIDED_BATCH_COUNT CUDNN_DEPRECATED_ENUM = 1524,
612
+ CUDNN_ATTR_OPERATION_MATMUL_GEMM_M_OVERRIDE_DESC = 1525,
613
+ CUDNN_ATTR_OPERATION_MATMUL_GEMM_N_OVERRIDE_DESC = 1526,
614
+ CUDNN_ATTR_OPERATION_MATMUL_GEMM_K_OVERRIDE_DESC = 1527,
615
+
616
+ CUDNN_ATTR_REDUCTION_OPERATOR = 1600,
617
+ CUDNN_ATTR_REDUCTION_COMP_TYPE = 1601,
618
+
619
+ CUDNN_ATTR_OPERATION_REDUCTION_XDESC = 1610,
620
+ CUDNN_ATTR_OPERATION_REDUCTION_YDESC = 1611,
621
+ CUDNN_ATTR_OPERATION_REDUCTION_DESC = 1612,
622
+
623
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MATH_PREC = 1620,
624
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MEAN_DESC = 1621,
625
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_INVSTD_DESC = 1622,
626
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_BN_SCALE_DESC = 1623,
627
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_X_DESC = 1624,
628
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DY_DESC = 1625,
629
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_SCALE_DESC = 1626,
630
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_BIAS_DESC = 1627,
631
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_DY_SCALE_DESC = 1628,
632
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_X_SCALE_DESC = 1629,
633
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_BIAS = 1630,
634
+
635
+ CUDNN_ATTR_RESAMPLE_MODE = 1700,
636
+ CUDNN_ATTR_RESAMPLE_COMP_TYPE = 1701,
637
+ CUDNN_ATTR_RESAMPLE_SPATIAL_DIMS = 1702,
638
+ CUDNN_ATTR_RESAMPLE_POST_PADDINGS = 1703,
639
+ CUDNN_ATTR_RESAMPLE_PRE_PADDINGS = 1704,
640
+ CUDNN_ATTR_RESAMPLE_STRIDES = 1705,
641
+ CUDNN_ATTR_RESAMPLE_WINDOW_DIMS = 1706,
642
+ CUDNN_ATTR_RESAMPLE_NAN_PROPAGATION = 1707,
643
+ CUDNN_ATTR_RESAMPLE_PADDING_MODE = 1708,
644
+
645
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_XDESC = 1710,
646
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_YDESC = 1711,
647
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_IDXDESC = 1712,
648
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_ALPHA CUDNN_DEPRECATED_ENUM = 1713,
649
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_BETA CUDNN_DEPRECATED_ENUM = 1714,
650
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_DESC = 1716,
651
+
652
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DXDESC = 1720,
653
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DYDESC = 1721,
654
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_IDXDESC = 1722,
655
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_ALPHA CUDNN_DEPRECATED_ENUM = 1723,
656
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_BETA CUDNN_DEPRECATED_ENUM = 1724,
657
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DESC = 1725,
658
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_XDESC = 1726,
659
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_YDESC = 1727,
660
+
661
+ CUDNN_ATTR_OPERATION_CONCAT_AXIS = 1800,
662
+ CUDNN_ATTR_OPERATION_CONCAT_INPUT_DESCS = 1801,
663
+ CUDNN_ATTR_OPERATION_CONCAT_INPLACE_INDEX = 1802,
664
+ CUDNN_ATTR_OPERATION_CONCAT_OUTPUT_DESC = 1803,
665
+
666
+ CUDNN_ATTR_OPERATION_SIGNAL_MODE = 1900,
667
+ CUDNN_ATTR_OPERATION_SIGNAL_FLAGDESC = 1901,
668
+ CUDNN_ATTR_OPERATION_SIGNAL_VALUE = 1902,
669
+ CUDNN_ATTR_OPERATION_SIGNAL_XDESC = 1903,
670
+ CUDNN_ATTR_OPERATION_SIGNAL_YDESC = 1904,
671
+
672
+ CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_CONTAINER_DESC = 1950,
673
+ CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_YDESC = 1951,
674
+ CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_SEQUENCE_DESC = 1952,
675
+ CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_PAGE_TABLE_DESC = 1953,
676
+
677
+ CUDNN_ATTR_OPERATION_NORM_FWD_MODE = 2000,
678
+ CUDNN_ATTR_OPERATION_NORM_FWD_PHASE = 2001,
679
+ CUDNN_ATTR_OPERATION_NORM_FWD_XDESC = 2002,
680
+ CUDNN_ATTR_OPERATION_NORM_FWD_MEAN_DESC = 2003,
681
+ CUDNN_ATTR_OPERATION_NORM_FWD_INV_VARIANCE_DESC = 2004,
682
+ CUDNN_ATTR_OPERATION_NORM_FWD_SCALE_DESC = 2005,
683
+ CUDNN_ATTR_OPERATION_NORM_FWD_BIAS_DESC = 2006,
684
+ CUDNN_ATTR_OPERATION_NORM_FWD_EPSILON_DESC = 2007,
685
+ CUDNN_ATTR_OPERATION_NORM_FWD_EXP_AVG_FACTOR_DESC = 2008,
686
+ CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_MEAN_DESC = 2009,
687
+ CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_VAR_DESC = 2010,
688
+ CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_MEAN_DESC = 2011,
689
+ CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_VAR_DESC = 2012,
690
+ CUDNN_ATTR_OPERATION_NORM_FWD_YDESC = 2013,
691
+ CUDNN_ATTR_OPERATION_NORM_FWD_PEER_STAT_DESCS = 2014,
692
+
693
+ CUDNN_ATTR_OPERATION_NORM_BWD_MODE = 2100,
694
+ CUDNN_ATTR_OPERATION_NORM_BWD_XDESC = 2101,
695
+ CUDNN_ATTR_OPERATION_NORM_BWD_MEAN_DESC = 2102,
696
+ CUDNN_ATTR_OPERATION_NORM_BWD_INV_VARIANCE_DESC = 2103,
697
+ CUDNN_ATTR_OPERATION_NORM_BWD_DYDESC = 2104,
698
+ CUDNN_ATTR_OPERATION_NORM_BWD_SCALE_DESC = 2105,
699
+ CUDNN_ATTR_OPERATION_NORM_BWD_EPSILON_DESC = 2106,
700
+ CUDNN_ATTR_OPERATION_NORM_BWD_DSCALE_DESC = 2107,
701
+ CUDNN_ATTR_OPERATION_NORM_BWD_DBIAS_DESC = 2108,
702
+ CUDNN_ATTR_OPERATION_NORM_BWD_DXDESC = 2109,
703
+ CUDNN_ATTR_OPERATION_NORM_BWD_PEER_STAT_DESCS = 2110,
704
+
705
+ CUDNN_ATTR_OPERATION_RESHAPE_XDESC = 2200,
706
+ CUDNN_ATTR_OPERATION_RESHAPE_YDESC = 2201,
707
+
708
+ CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_XDESC = 2250,
709
+ CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_YDESC = 2251,
710
+ CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_LOWER_BANDWIDTH = 2252,
711
+ CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_UPPER_BANDWIDTH = 2253,
712
+ CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_AXIS = 2254,
713
+ CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_PAD_VALUE = 2255,
714
+ CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_KV_TOKEN_OFFSET_DESC = 2256,
715
+
716
+ CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_XDESC = 2270,
717
+ CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_YDESC = 2271,
718
+ CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_LOWER_BANDWIDTH = 2272,
719
+ CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_UPPER_BANDWIDTH = 2273,
720
+ CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_AXIS = 2274,
721
+ CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_PAD_VALUE = 2275,
722
+ CUDNN_ATTR_OPERATION_CONTRACT_BAND_MAX_TOKEN_VALUE = 2276,
723
+
724
+ CUDNN_ATTR_RNG_DISTRIBUTION = 2300,
725
+ CUDNN_ATTR_RNG_NORMAL_DIST_MEAN = 2301,
726
+ CUDNN_ATTR_RNG_NORMAL_DIST_STANDARD_DEVIATION = 2302,
727
+ CUDNN_ATTR_RNG_UNIFORM_DIST_MAXIMUM = 2303,
728
+ CUDNN_ATTR_RNG_UNIFORM_DIST_MINIMUM = 2304,
729
+ CUDNN_ATTR_RNG_BERNOULLI_DIST_PROBABILITY = 2305,
730
+
731
+ CUDNN_ATTR_OPERATION_RNG_YDESC = 2310,
732
+ CUDNN_ATTR_OPERATION_RNG_SEED = 2311,
733
+ CUDNN_ATTR_OPERATION_RNG_DESC = 2312,
734
+ CUDNN_ATTR_OPERATION_RNG_OFFSET_DESC = 2313,
735
+
736
+ CUDNN_ATTR_KERNEL_CACHE_OPERATION_GRAPH = 2400,
737
+ CUDNN_ATTR_KERNEL_CACHE_IS_ENGINECFG_KERNEL_CACHED = 2401,
738
+ CUDNN_ATTR_KERNEL_CACHE_JSON_REPRESENTATION = 2402,
739
+
740
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_XDESC = 2500,
741
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_YDESC = 2501,
742
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_SCALE_DESC = 2502,
743
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_MATH_PREC = 2503,
744
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_BLOCK_SIZE = 2504,
745
+
746
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_XDESC = 2600,
747
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_SCALE_DESC = 2601,
748
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_YDESC = 2602,
749
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_MATH_PREC = 2603,
750
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_BLOCK_SIZE = 2604,
751
+
752
+ CUDNN_ATTR_DEVICEPROP_DEVICE_ID = 2700,
753
+ CUDNN_ATTR_DEVICEPROP_HANDLE = 2701,
754
+ CUDNN_ATTR_DEVICEPROP_JSON_REPRESENTATION = 2702,
755
+ } cudnnBackendAttributeName_t;
756
+
757
+ typedef enum {
758
+ CUDNN_TYPE_HANDLE = 0,
759
+ CUDNN_TYPE_DATA_TYPE = 1,
760
+ CUDNN_TYPE_BOOLEAN = 2,
761
+ CUDNN_TYPE_INT64 = 3,
762
+ CUDNN_TYPE_FLOAT = 4,
763
+ CUDNN_TYPE_DOUBLE = 5,
764
+ CUDNN_TYPE_VOID_PTR = 6,
765
+ CUDNN_TYPE_CONVOLUTION_MODE = 7,
766
+ CUDNN_TYPE_HEUR_MODE = 8,
767
+ CUDNN_TYPE_KNOB_TYPE = 9,
768
+ CUDNN_TYPE_NAN_PROPOGATION CUDNN_DEPRECATED_ENUM = 10,
769
+ CUDNN_TYPE_NUMERICAL_NOTE = 11,
770
+ CUDNN_TYPE_LAYOUT_TYPE = 12,
771
+ CUDNN_TYPE_ATTRIB_NAME = 13,
772
+ CUDNN_TYPE_POINTWISE_MODE = 14,
773
+ CUDNN_TYPE_BACKEND_DESCRIPTOR = 15,
774
+ CUDNN_TYPE_GENSTATS_MODE = 16,
775
+ CUDNN_TYPE_BN_FINALIZE_STATS_MODE = 17,
776
+ CUDNN_TYPE_REDUCTION_OPERATOR_TYPE = 18,
777
+ CUDNN_TYPE_BEHAVIOR_NOTE = 19,
778
+ CUDNN_TYPE_TENSOR_REORDERING_MODE = 20,
779
+ CUDNN_TYPE_RESAMPLE_MODE = 21,
780
+ CUDNN_TYPE_PADDING_MODE = 22,
781
+ CUDNN_TYPE_INT32 = 23,
782
+ CUDNN_TYPE_CHAR = 24,
783
+ CUDNN_TYPE_SIGNAL_MODE = 25,
784
+ CUDNN_TYPE_FRACTION = 26,
785
+ CUDNN_TYPE_NORM_MODE = 27,
786
+ CUDNN_TYPE_NORM_FWD_PHASE = 28,
787
+ CUDNN_TYPE_RNG_DISTRIBUTION = 29,
788
+ } cudnnBackendAttributeType_t;
789
+
790
+ typedef enum {
791
+ CUDNN_BACKEND_POINTWISE_DESCRIPTOR = 0,
792
+ CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR = 1,
793
+ CUDNN_BACKEND_ENGINE_DESCRIPTOR = 2,
794
+ CUDNN_BACKEND_ENGINECFG_DESCRIPTOR = 3,
795
+ CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR = 4,
796
+ CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR = 5,
797
+ CUDNN_BACKEND_INTERMEDIATE_INFO_DESCRIPTOR = 6,
798
+ CUDNN_BACKEND_KNOB_CHOICE_DESCRIPTOR = 7,
799
+ CUDNN_BACKEND_KNOB_INFO_DESCRIPTOR = 8,
800
+ CUDNN_BACKEND_LAYOUT_INFO_DESCRIPTOR = 9,
801
+ CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR = 10,
802
+ CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR = 11,
803
+ CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR = 12,
804
+ CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR = 13,
805
+ CUDNN_BACKEND_OPERATION_GEN_STATS_DESCRIPTOR = 14,
806
+ CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR = 15,
807
+ CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR = 16,
808
+ CUDNN_BACKEND_TENSOR_DESCRIPTOR = 17,
809
+ CUDNN_BACKEND_MATMUL_DESCRIPTOR = 18,
810
+ CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR = 19,
811
+ CUDNN_BACKEND_OPERATION_BN_FINALIZE_STATISTICS_DESCRIPTOR = 20,
812
+ CUDNN_BACKEND_REDUCTION_DESCRIPTOR = 21,
813
+ CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR = 22,
814
+ CUDNN_BACKEND_OPERATION_BN_BWD_WEIGHTS_DESCRIPTOR = 23,
815
+ CUDNN_BACKEND_RESAMPLE_DESCRIPTOR = 24,
816
+ CUDNN_BACKEND_OPERATION_RESAMPLE_FWD_DESCRIPTOR = 25,
817
+ CUDNN_BACKEND_OPERATION_RESAMPLE_BWD_DESCRIPTOR = 26,
818
+ CUDNN_BACKEND_OPERATION_CONCAT_DESCRIPTOR = 27,
819
+ CUDNN_BACKEND_OPERATION_SIGNAL_DESCRIPTOR = 28,
820
+ CUDNN_BACKEND_OPERATION_NORM_FORWARD_DESCRIPTOR = 29,
821
+ CUDNN_BACKEND_OPERATION_NORM_BACKWARD_DESCRIPTOR = 30,
822
+ CUDNN_BACKEND_OPERATION_RESHAPE_DESCRIPTOR = 31,
823
+ CUDNN_BACKEND_RNG_DESCRIPTOR = 32,
824
+ CUDNN_BACKEND_OPERATION_RNG_DESCRIPTOR = 33,
825
+ CUDNN_BACKEND_KERNEL_CACHE_DESCRIPTOR = 34,
826
+ CUDNN_BACKEND_OPERATION_PAGED_CACHE_LOAD_DESCRIPTOR = 35,
827
+ CUDNN_BACKEND_OPERATION_BLOCK_SCALE_QUANTIZE_DESCRIPTOR = 36,
828
+ CUDNN_BACKEND_OPERATION_BLOCK_SCALE_DEQUANTIZE_DESCRIPTOR = 37,
829
+ CUDNN_BACKEND_DEVICEPROP_DESCRIPTOR = 38,
830
+ CUDNN_BACKEND_OPERATION_EXPAND_BAND_MATRIX_DESCRIPTOR = 39,
831
+ CUDNN_BACKEND_OPERATION_CONTRACT_BAND_MATRIX_DESCRIPTOR = 40,
832
+ } cudnnBackendDescriptorType_t;
833
+
834
+ typedef enum {
835
+ CUDNN_NUMERICAL_NOTE_TENSOR_CORE = 0,
836
+ CUDNN_NUMERICAL_NOTE_DOWN_CONVERT_INPUTS = 1,
837
+ CUDNN_NUMERICAL_NOTE_REDUCED_PRECISION_REDUCTION = 2,
838
+ CUDNN_NUMERICAL_NOTE_FFT = 3,
839
+ CUDNN_NUMERICAL_NOTE_NONDETERMINISTIC = 4,
840
+ CUDNN_NUMERICAL_NOTE_WINOGRAD = 5,
841
+ CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_4x4 = 6,
842
+ CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_6x6 = 7,
843
+ CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_13x13 = 8,
844
+ CUDNN_NUMERICAL_NOTE_STRICT_NAN_PROP = 9,
845
+ CUDNN_NUMERICAL_NOTE_TYPE_COUNT = 10,
846
+ } cudnnBackendNumericalNote_t;
847
+
848
+ typedef enum {
849
+ CUDNN_BEHAVIOR_NOTE_RUNTIME_COMPILATION = 0,
850
+ CUDNN_BEHAVIOR_NOTE_REQUIRES_FILTER_INT8x32_REORDER = 1,
851
+ CUDNN_BEHAVIOR_NOTE_REQUIRES_BIAS_INT8x32_REORDER = 2,
852
+ CUDNN_BEHAVIOR_NOTE_SUPPORTS_CUDA_GRAPH_NATIVE_API = 3,
853
+ CUDNN_BEHAVIOR_NOTE_TYPE_COUNT = 4,
854
+ } cudnnBackendBehaviorNote_t;
855
+
856
+ typedef enum {
857
+ CUDNN_KNOB_TYPE_SPLIT_K CUDNN_DEPRECATED_ENUM = 0,
858
+ CUDNN_KNOB_TYPE_SWIZZLE = 1,
859
+ CUDNN_KNOB_TYPE_TILE_SIZE = 2,
860
+ CUDNN_KNOB_TYPE_USE_TEX CUDNN_DEPRECATED_ENUM = 3,
861
+ CUDNN_KNOB_TYPE_EDGE = 4,
862
+ CUDNN_KNOB_TYPE_KBLOCK CUDNN_DEPRECATED_ENUM = 5,
863
+ CUDNN_KNOB_TYPE_LDGA CUDNN_DEPRECATED_ENUM = 6,
864
+ CUDNN_KNOB_TYPE_LDGB CUDNN_DEPRECATED_ENUM = 7,
865
+ CUDNN_KNOB_TYPE_CHUNK_K CUDNN_DEPRECATED_ENUM = 8,
866
+ CUDNN_KNOB_TYPE_SPLIT_H CUDNN_DEPRECATED_ENUM = 9,
867
+ CUDNN_KNOB_TYPE_WINO_TILE CUDNN_DEPRECATED_ENUM = 10,
868
+ CUDNN_KNOB_TYPE_MULTIPLY = 11,
869
+ CUDNN_KNOB_TYPE_SPLIT_K_BUF = 12,
870
+ CUDNN_KNOB_TYPE_TILEK = 13,
871
+ CUDNN_KNOB_TYPE_STAGES = 14,
872
+ CUDNN_KNOB_TYPE_REDUCTION_MODE = 15,
873
+ CUDNN_KNOB_TYPE_CTA_SPLIT_K_MODE CUDNN_DEPRECATED_ENUM = 16,
874
+ CUDNN_KNOB_TYPE_SPLIT_K_SLC = 17,
875
+ CUDNN_KNOB_TYPE_IDX_MODE = 18,
876
+ CUDNN_KNOB_TYPE_SLICED CUDNN_DEPRECATED_ENUM = 19,
877
+ CUDNN_KNOB_TYPE_SPLIT_RS CUDNN_DEPRECATED_ENUM = 20,
878
+ CUDNN_KNOB_TYPE_SINGLEBUFFER CUDNN_DEPRECATED_ENUM = 21,
879
+ CUDNN_KNOB_TYPE_LDGC CUDNN_DEPRECATED_ENUM = 22,
880
+ CUDNN_KNOB_TYPE_SPECFILT = 23,
881
+ CUDNN_KNOB_TYPE_KERNEL_CFG = 24,
882
+ CUDNN_KNOB_TYPE_WORKSPACE = 25,
883
+ CUDNN_KNOB_TYPE_TILE_CGA CUDNN_DEPRECATED_ENUM = 26,
884
+ CUDNN_KNOB_TYPE_TILE_CGA_M = 27,
885
+ CUDNN_KNOB_TYPE_TILE_CGA_N = 28,
886
+ CUDNN_KNOB_TYPE_BLOCK_SIZE = 29,
887
+ CUDNN_KNOB_TYPE_OCCUPANCY = 30,
888
+ CUDNN_KNOB_TYPE_ARRAY_SIZE_PER_THREAD = 31,
889
+ CUDNN_KNOB_TYPE_NUM_C_PER_BLOCK CUDNN_DEPRECATED_ENUM = 32,
890
+ CUDNN_KNOB_TYPE_SPLIT_COLS = 33,
891
+ CUDNN_KNOB_TYPE_TILE_ROWS = 34,
892
+ CUDNN_KNOB_TYPE_TILE_COLS = 35,
893
+ CUDNN_KNOB_TYPE_LOAD_SIZE = 36,
894
+ CUDNN_KNOB_TYPE_CTA_COUNT = 37,
895
+ CUDNN_KNOB_TYPE_STREAM_K = 38,
896
+ CUDNN_KNOB_TYPE_SPLIT_P_SLC = 39,
897
+ CUDNN_KNOB_TYPE_TILE_M = 40,
898
+ CUDNN_KNOB_TYPE_TILE_N = 41,
899
+ CUDNN_KNOB_TYPE_WARP_SPEC_CFG = 42,
900
+ CUDNN_KNOB_TYPE_COUNTS = 43,
901
+ } cudnnBackendKnobType_t;
902
+
903
+ typedef enum {
904
+ CUDNN_LAYOUT_TYPE_PREFERRED_NCHW = 0,
905
+ CUDNN_LAYOUT_TYPE_PREFERRED_NHWC = 1,
906
+ CUDNN_LAYOUT_TYPE_PREFERRED_PAD4CK = 2,
907
+ CUDNN_LAYOUT_TYPE_PREFERRED_PAD8CK = 3,
908
+ CUDNN_LAYOUT_TYPE_COUNT = 4,
909
+ } cudnnBackendLayoutType_t;
910
+
911
+ typedef enum {
912
+ CUDNN_HEUR_MODE_INSTANT = 0,
913
+ CUDNN_HEUR_MODE_B = 1,
914
+ CUDNN_HEUR_MODE_FALLBACK = 2,
915
+ CUDNN_HEUR_MODE_A = 3,
916
+ CUDNN_HEUR_MODES_COUNT = 4,
917
+ } cudnnBackendHeurMode_t;
918
+
919
+ typedef enum {
920
+ CUDNN_TENSOR_REORDERING_NONE = 0,
921
+ CUDNN_TENSOR_REORDERING_INT8x32 = 1,
922
+ CUDNN_TENSOR_REORDERING_F16x16 = 2,
923
+ CUDNN_TENSOR_REORDERING_F8_128x4 = 3,
924
+ } cudnnBackendTensorReordering_t;
925
+
926
+ typedef enum {
927
+ CUDNN_ZERO_PAD = 0,
928
+ CUDNN_NEG_INF_PAD = 1,
929
+ CUDNN_EDGE_VAL_PAD = 2,
930
+ } cudnnPaddingMode_t;
931
+
932
+ typedef enum {
933
+ CUDNN_LAYER_NORM = 0,
934
+ CUDNN_INSTANCE_NORM = 1,
935
+ CUDNN_BATCH_NORM = 2,
936
+ CUDNN_GROUP_NORM = 3,
937
+ CUDNN_RMS_NORM = 4,
938
+ CUDNN_ADA_LAYER_NORM = 5,
939
+ } cudnnBackendNormMode_t;
940
+
941
+ typedef enum {
942
+ CUDNN_NORM_FWD_INFERENCE = 0,
943
+ CUDNN_NORM_FWD_TRAINING = 1,
944
+ } cudnnBackendNormFwdPhase_t;
945
+
946
+ cudnnStatus_t CUDNNWINAPI
947
+ cudnnBackendCreateDescriptor(cudnnBackendDescriptorType_t descriptorType, cudnnBackendDescriptor_t *descriptor);
948
+
949
+ cudnnStatus_t CUDNNWINAPI
950
+ cudnnBackendDestroyDescriptor(cudnnBackendDescriptor_t descriptor);
951
+
952
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
953
+ cudnnBackendInitialize(cudnnBackendDescriptor_t descriptor);
954
+
955
+ cudnnStatus_t CUDNNWINAPI
956
+ cudnnBackendFinalize(cudnnBackendDescriptor_t descriptor);
957
+
958
+ cudnnStatus_t CUDNNWINAPI
959
+ cudnnBackendSetAttribute(cudnnBackendDescriptor_t descriptor,
960
+ cudnnBackendAttributeName_t attributeName,
961
+ cudnnBackendAttributeType_t attributeType,
962
+ int64_t elementCount,
963
+ const void *arrayOfElements);
964
+
965
+ cudnnStatus_t CUDNNWINAPI
966
+ cudnnBackendGetAttribute(cudnnBackendDescriptor_t const descriptor,
967
+ cudnnBackendAttributeName_t attributeName,
968
+ cudnnBackendAttributeType_t attributeType,
969
+ int64_t requestedElementCount,
970
+ int64_t *elementCount,
971
+ void *arrayOfElements);
972
+
973
+ cudnnStatus_t CUDNNWINAPI
974
+ cudnnBackendExecute(cudnnHandle_t handle, cudnnBackendDescriptor_t executionPlan, cudnnBackendDescriptor_t variantPack);
975
+
976
+ cudnnStatus_t CUDNNWINAPI
977
+ cudnnBackendPopulateCudaGraph(cudnnHandle_t handle,
978
+ cudnnBackendDescriptor_t executionPlan,
979
+ cudnnBackendDescriptor_t variantPack,
980
+ cudaGraph_t graph);
981
+
982
+ cudnnStatus_t CUDNNWINAPI
983
+ cudnnBackendUpdateCudaGraph(cudnnHandle_t handle,
984
+ cudnnBackendDescriptor_t executionPlan,
985
+ cudnnBackendDescriptor_t variantPack,
986
+ cudaGraph_t graph);
987
+
988
+ #if defined(__cplusplus)
989
+ }
990
+ #endif
991
+
992
+ #endif /* CUDNN_GRAPH_H_ */
.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_ops.h ADDED
@@ -0,0 +1,1316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2014-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /*
51
+ * cudnn_ops : cuDNN's basic definitions and basic operations.
52
+ */
53
+
54
+ #if !defined(CUDNN_OPS_H_)
55
+ #define CUDNN_OPS_H_
56
+
57
+ #include <stdint.h>
58
+
59
+ #include "cudnn_version.h"
60
+ #include "cudnn_graph.h"
61
+
62
+ /* These version numbers are autogenerated, do not edit manually. */
63
+ #define CUDNN_OPS_MAJOR 9
64
+ #define CUDNN_OPS_MINOR 10
65
+ #define CUDNN_OPS_PATCH 2
66
+
67
+ #if (CUDNN_OPS_MAJOR != CUDNN_MAJOR) || (CUDNN_OPS_MINOR != CUDNN_MINOR) || (CUDNN_OPS_PATCH != CUDNN_PATCHLEVEL)
68
+ #error Version mismatch in cuDNN OPS INFER!!!
69
+ #endif
70
+
71
+ #if defined(__cplusplus)
72
+ extern "C" {
73
+ #endif
74
+
75
+ /* Data structures to represent Image/Filter and the Neural Network Layer */
76
+ typedef struct cudnnTensorStruct *cudnnTensorDescriptor_t;
77
+ typedef struct cudnnPoolingStruct *cudnnPoolingDescriptor_t CUDNN_DEPRECATED;
78
+ typedef struct cudnnFilterStruct *cudnnFilterDescriptor_t CUDNN_DEPRECATED;
79
+ typedef struct cudnnLRNStruct *cudnnLRNDescriptor_t;
80
+ typedef struct cudnnActivationStruct *cudnnActivationDescriptor_t CUDNN_DEPRECATED;
81
+ typedef struct cudnnSpatialTransformerStruct *cudnnSpatialTransformerDescriptor_t;
82
+ typedef struct cudnnOpTensorStruct *cudnnOpTensorDescriptor_t CUDNN_DEPRECATED;
83
+ typedef struct cudnnReduceTensorStruct *cudnnReduceTensorDescriptor_t CUDNN_DEPRECATED;
84
+ typedef struct cudnnCTCLossStruct *cudnnCTCLossDescriptor_t;
85
+ typedef struct cudnnTensorTransformStruct *cudnnTensorTransformDescriptor_t CUDNN_DEPRECATED;
86
+ /*
87
+ * CUDNN Determinism
88
+ */
89
+ typedef enum {
90
+ CUDNN_NON_DETERMINISTIC = 0,
91
+ CUDNN_DETERMINISTIC = 1,
92
+ } cudnnDeterminism_t;
93
+
94
+ /* Create an instance of a generic Tensor descriptor */
95
+ cudnnStatus_t CUDNNWINAPI
96
+ cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc);
97
+
98
+ cudnnStatus_t CUDNNWINAPI
99
+ cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc,
100
+ cudnnTensorFormat_t format,
101
+ cudnnDataType_t dataType, /* image data type */
102
+ int n, /* number of inputs (batch size) */
103
+ int c, /* number of input feature maps */
104
+ int h, /* height of input section */
105
+ int w); /* width of input section */
106
+
107
+ cudnnStatus_t CUDNNWINAPI
108
+ cudnnSetTensor4dDescriptorEx(cudnnTensorDescriptor_t tensorDesc,
109
+ cudnnDataType_t dataType, /* image data type */
110
+ int n, /* number of inputs (batch size) */
111
+ int c, /* number of input feature maps */
112
+ int h, /* height of input section */
113
+ int w, /* width of input section */
114
+ int nStride,
115
+ int cStride,
116
+ int hStride,
117
+ int wStride);
118
+
119
+ cudnnStatus_t CUDNNWINAPI
120
+ cudnnGetTensor4dDescriptor(const cudnnTensorDescriptor_t tensorDesc,
121
+ cudnnDataType_t *dataType, /* image data type */
122
+ int *n, /* number of inputs (batch size) */
123
+ int *c, /* number of input feature maps */
124
+ int *h, /* height of input section */
125
+ int *w, /* width of input section */
126
+ int *nStride,
127
+ int *cStride,
128
+ int *hStride,
129
+ int *wStride);
130
+
131
+ cudnnStatus_t CUDNNWINAPI
132
+ cudnnSetTensorNdDescriptor(cudnnTensorDescriptor_t tensorDesc,
133
+ cudnnDataType_t dataType,
134
+ int nbDims,
135
+ const int dimA[],
136
+ const int strideA[]);
137
+
138
+ cudnnStatus_t CUDNNWINAPI
139
+ cudnnSetTensorNdDescriptorEx(cudnnTensorDescriptor_t tensorDesc,
140
+ cudnnTensorFormat_t format,
141
+ cudnnDataType_t dataType,
142
+ int nbDims,
143
+ const int dimA[]);
144
+
145
+ cudnnStatus_t CUDNNWINAPI
146
+ cudnnGetTensorNdDescriptor(const cudnnTensorDescriptor_t tensorDesc,
147
+ int nbDimsRequested,
148
+ cudnnDataType_t *dataType,
149
+ int *nbDims,
150
+ int dimA[],
151
+ int strideA[]);
152
+
153
+ cudnnStatus_t CUDNNWINAPI
154
+ cudnnGetTensorSizeInBytes(const cudnnTensorDescriptor_t tensorDesc, size_t *size);
155
+
156
+ /* PixelOffset( n, c, h, w ) = n *input_stride + c * feature_stride + h * h_stride + w * w_stride
157
+
158
+ 1)Example of all images in row major order one batch of features after the other (with an optional padding on row)
159
+ input_stride : c x h x h_stride
160
+ feature_stride : h x h_stride
161
+ h_stride : >= w ( h_stride = w if no padding)
162
+ w_stride : 1
163
+
164
+
165
+ 2)Example of all images in row major with features maps interleaved
166
+ input_stride : c x h x h_stride
167
+ feature_stride : 1
168
+ h_stride : w x c
169
+ w_stride : c
170
+
171
+ 3)Example of all images in column major order one batch of features after the other (with optional padding on column)
172
+ input_stride : c x w x w_stride
173
+ feature_stride : w x w_stride
174
+ h_stride : 1
175
+ w_stride : >= h
176
+
177
+ */
178
+
179
+ /* Destroy an instance of Tensor4d descriptor */
180
+ cudnnStatus_t CUDNNWINAPI
181
+ cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc);
182
+
183
+ /* Fold/unfold transforms */
184
+ typedef enum {
185
+ CUDNN_TRANSFORM_FOLD = 0U,
186
+ CUDNN_TRANSFORM_UNFOLD = 1U,
187
+ } cudnnFoldingDirection_t;
188
+
189
+ /** Create a destination descriptor for cudnnTransformTensor */
190
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
191
+ cudnnInitTransformDest(const cudnnTensorTransformDescriptor_t transformDesc,
192
+ const cudnnTensorDescriptor_t srcDesc,
193
+ cudnnTensorDescriptor_t destDesc,
194
+ size_t *destSizeInBytes);
195
+
196
+ /** Create an empty tensor transform descriptor */
197
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
198
+ cudnnCreateTensorTransformDescriptor(cudnnTensorTransformDescriptor_t *transformDesc);
199
+
200
+ /** Initialize a previously created tensor transform descriptor. */
201
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
202
+ cudnnSetTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc,
203
+ const uint32_t nbDims,
204
+ const cudnnTensorFormat_t destFormat,
205
+ const int32_t padBeforeA[],
206
+ const int32_t padAfterA[],
207
+ const uint32_t foldA[],
208
+ const cudnnFoldingDirection_t direction);
209
+
210
+ /**
211
+ * Retrieves the values stored in a previously initialized tensor transform
212
+ * descriptor.
213
+ */
214
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
215
+ cudnnGetTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc,
216
+ uint32_t nbDimsRequested,
217
+ cudnnTensorFormat_t *destFormat,
218
+ int32_t padBeforeA[],
219
+ int32_t padAfterA[],
220
+ uint32_t foldA[],
221
+ cudnnFoldingDirection_t *direction);
222
+
223
+ /**
224
+ * Destroys a previously created tensor transform descriptor.
225
+ */
226
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
227
+ cudnnDestroyTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc);
228
+
229
+ /* Tensor layout conversion helper (y = alpha * x + beta * y) */
230
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
231
+ cudnnTransformTensor(cudnnHandle_t handle,
232
+ const void *alpha,
233
+ const cudnnTensorDescriptor_t xDesc,
234
+ const void *x,
235
+ const void *beta,
236
+ const cudnnTensorDescriptor_t yDesc,
237
+ void *y);
238
+
239
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
240
+ cudnnTransformTensorEx(cudnnHandle_t handle,
241
+ const cudnnTensorTransformDescriptor_t transDesc,
242
+ const void *alpha,
243
+ const cudnnTensorDescriptor_t srcDesc,
244
+ const void *srcData,
245
+ const void *beta,
246
+ const cudnnTensorDescriptor_t destDesc,
247
+ void *destData);
248
+
249
+ /* Tensor Bias addition : C = alpha * A + beta * C */
250
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
251
+ cudnnAddTensor(cudnnHandle_t handle,
252
+ const void *alpha,
253
+ const cudnnTensorDescriptor_t aDesc,
254
+ const void *A,
255
+ const void *beta,
256
+ const cudnnTensorDescriptor_t cDesc,
257
+ void *C);
258
+
259
+ /*
260
+ * CUDNN OpTensor op type
261
+ */
262
+ typedef enum {
263
+ CUDNN_OP_TENSOR_ADD = 0,
264
+ CUDNN_OP_TENSOR_MUL = 1,
265
+ CUDNN_OP_TENSOR_MIN = 2,
266
+ CUDNN_OP_TENSOR_MAX = 3,
267
+ CUDNN_OP_TENSOR_SQRT = 4,
268
+ CUDNN_OP_TENSOR_NOT = 5,
269
+ } cudnnOpTensorOp_t;
270
+
271
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
272
+ cudnnCreateOpTensorDescriptor(cudnnOpTensorDescriptor_t *opTensorDesc);
273
+
274
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
275
+ cudnnSetOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc,
276
+ cudnnOpTensorOp_t opTensorOp,
277
+ cudnnDataType_t opTensorCompType,
278
+ cudnnNanPropagation_t opTensorNanOpt);
279
+
280
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
281
+ cudnnGetOpTensorDescriptor(const cudnnOpTensorDescriptor_t opTensorDesc,
282
+ cudnnOpTensorOp_t *opTensorOp,
283
+ cudnnDataType_t *opTensorCompType,
284
+ cudnnNanPropagation_t *opTensorNanOpt);
285
+
286
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
287
+ cudnnDestroyOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc);
288
+
289
+ /* Tensor operation : C = op( alpha1 * A, alpha2 * B ) + beta * C */
290
+ /* B tensor is ignored for CUDNN_OP_TENSOR_SQRT, CUDNN_OP_TENSOR_NOT. */
291
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
292
+ cudnnOpTensor(cudnnHandle_t handle,
293
+ const cudnnOpTensorDescriptor_t opTensorDesc,
294
+ const void *alpha1,
295
+ const cudnnTensorDescriptor_t aDesc,
296
+ const void *A,
297
+ const void *alpha2,
298
+ const cudnnTensorDescriptor_t bDesc,
299
+ const void *B,
300
+ const void *beta,
301
+ const cudnnTensorDescriptor_t cDesc,
302
+ void *C);
303
+
304
+ /*
305
+ * CUDNN ReduceTensor indices type
306
+ */
307
+ typedef enum {
308
+ CUDNN_REDUCE_TENSOR_NO_INDICES = 0,
309
+ CUDNN_REDUCE_TENSOR_FLATTENED_INDICES = 1,
310
+ } cudnnReduceTensorIndices_t CUDNN_DEPRECATED;
311
+
312
+ /*
313
+ * CUDNN tensor indices type size (all unsigned)
314
+ * Currently not supported, default is 32 bit unsigned.
315
+ */
316
+ typedef enum {
317
+ CUDNN_32BIT_INDICES = 0,
318
+ CUDNN_64BIT_INDICES = 1,
319
+ CUDNN_16BIT_INDICES = 2,
320
+ CUDNN_8BIT_INDICES = 3,
321
+ } cudnnIndicesType_t CUDNN_DEPRECATED;
322
+
323
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
324
+ cudnnCreateReduceTensorDescriptor(cudnnReduceTensorDescriptor_t *reduceTensorDesc);
325
+
326
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
327
+ cudnnSetReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc,
328
+ cudnnReduceTensorOp_t reduceTensorOp,
329
+ cudnnDataType_t reduceTensorCompType,
330
+ cudnnNanPropagation_t reduceTensorNanOpt,
331
+ cudnnReduceTensorIndices_t reduceTensorIndices,
332
+ cudnnIndicesType_t reduceTensorIndicesType);
333
+
334
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
335
+ cudnnGetReduceTensorDescriptor(const cudnnReduceTensorDescriptor_t reduceTensorDesc,
336
+ cudnnReduceTensorOp_t *reduceTensorOp,
337
+ cudnnDataType_t *reduceTensorCompType,
338
+ cudnnNanPropagation_t *reduceTensorNanOpt,
339
+ cudnnReduceTensorIndices_t *reduceTensorIndices,
340
+ cudnnIndicesType_t *reduceTensorIndicesType);
341
+
342
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
343
+ cudnnDestroyReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc);
344
+
345
+ /* Helper function to return the minimum size of the index space to be passed to the reduction given the input and
346
+ * output tensors */
347
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
348
+ cudnnGetReductionIndicesSize(cudnnHandle_t handle,
349
+ const cudnnReduceTensorDescriptor_t reduceTensorDesc,
350
+ const cudnnTensorDescriptor_t aDesc,
351
+ const cudnnTensorDescriptor_t cDesc,
352
+ size_t *sizeInBytes);
353
+
354
+ /* Helper function to return the minimum size of the workspace to be passed to the reduction given the input and output
355
+ * tensors */
356
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
357
+ cudnnGetReductionWorkspaceSize(cudnnHandle_t handle,
358
+ const cudnnReduceTensorDescriptor_t reduceTensorDesc,
359
+ const cudnnTensorDescriptor_t aDesc,
360
+ const cudnnTensorDescriptor_t cDesc,
361
+ size_t *sizeInBytes);
362
+
363
+ /* Tensor operation : C = reduce op( alpha * A ) + beta * C */
364
+ /* The NaN propagation enum applies to only the min and max reduce ops; the other reduce ops propagate NaN as usual. */
365
+ /* The indices space is ignored for reduce ops other than min or max. */
366
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
367
+ cudnnReduceTensor(cudnnHandle_t handle,
368
+ const cudnnReduceTensorDescriptor_t reduceTensorDesc,
369
+ void *indices,
370
+ size_t indicesSizeInBytes,
371
+ void *workspace,
372
+ size_t workspaceSizeInBytes,
373
+ const void *alpha,
374
+ const cudnnTensorDescriptor_t aDesc,
375
+ const void *A,
376
+ const void *beta,
377
+ const cudnnTensorDescriptor_t cDesc,
378
+ void *C);
379
+
380
+ /* Set all values of a tensor to a given value : y[i] = value[0] */
381
+ cudnnStatus_t CUDNNWINAPI
382
+ cudnnSetTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *valuePtr);
383
+
384
+ /* Scale all values of a tensor by a given factor : y[i] = alpha * y[i] */
385
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
386
+ cudnnScaleTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *alpha);
387
+
388
+ /* Create an instance of FilterStruct */
389
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
390
+ cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc);
391
+
392
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
393
+ cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc,
394
+ cudnnDataType_t dataType, /* image data type */
395
+ cudnnTensorFormat_t format,
396
+ int k, /* number of output feature maps */
397
+ int c, /* number of input feature maps */
398
+ int h, /* height of each input filter */
399
+ int w); /* width of each input filter */
400
+
401
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
402
+ cudnnGetFilter4dDescriptor(const cudnnFilterDescriptor_t filterDesc,
403
+ cudnnDataType_t *dataType, /* image data type */
404
+ cudnnTensorFormat_t *format,
405
+ int *k, /* number of output feature maps */
406
+ int *c, /* number of input feature maps */
407
+ int *h, /* height of each input filter */
408
+ int *w); /* width of each input filter */
409
+
410
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
411
+ cudnnSetFilterNdDescriptor(cudnnFilterDescriptor_t filterDesc,
412
+ cudnnDataType_t dataType, /* image data type */
413
+ cudnnTensorFormat_t format,
414
+ int nbDims,
415
+ const int filterDimA[]);
416
+
417
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
418
+ cudnnGetFilterNdDescriptor(const cudnnFilterDescriptor_t filterDesc,
419
+ int nbDimsRequested,
420
+ cudnnDataType_t *dataType, /* image data type */
421
+ cudnnTensorFormat_t *format,
422
+ int *nbDims,
423
+ int filterDimA[]);
424
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
425
+ cudnnGetFilterSizeInBytes(const cudnnFilterDescriptor_t filterDesc, size_t *size);
426
+
427
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
428
+ cudnnTransformFilter(cudnnHandle_t handle,
429
+ const cudnnTensorTransformDescriptor_t transDesc,
430
+ const void *alpha,
431
+ const cudnnFilterDescriptor_t srcDesc,
432
+ const void *srcData,
433
+ const void *beta,
434
+ const cudnnFilterDescriptor_t destDesc,
435
+ void *destData);
436
+
437
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
438
+ cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc);
439
+
440
+ /*
441
+ * softmax algorithm
442
+ */
443
+ typedef enum {
444
+ CUDNN_SOFTMAX_FAST = 0, /* straightforward implementation */
445
+ CUDNN_SOFTMAX_ACCURATE = 1, /* subtract max from every point to avoid overflow */
446
+ CUDNN_SOFTMAX_LOG = 2
447
+ } cudnnSoftmaxAlgorithm_t;
448
+
449
+ typedef enum {
450
+ CUDNN_SOFTMAX_MODE_INSTANCE = 0, /* compute the softmax over all C, H, W for each N */
451
+ CUDNN_SOFTMAX_MODE_CHANNEL = 1 /* compute the softmax over all C for each H, W, N */
452
+ } cudnnSoftmaxMode_t;
453
+
454
+ /* Softmax functions: All of the form "output = alpha * Op(inputs) + beta * output" */
455
+
456
+ /* Function to perform forward softmax */
457
+ cudnnStatus_t CUDNNWINAPI
458
+ cudnnSoftmaxForward(cudnnHandle_t handle,
459
+ cudnnSoftmaxAlgorithm_t algo,
460
+ cudnnSoftmaxMode_t mode,
461
+ const void *alpha,
462
+ const cudnnTensorDescriptor_t xDesc,
463
+ const void *x,
464
+ const void *beta,
465
+ const cudnnTensorDescriptor_t yDesc,
466
+ void *y);
467
+
468
+ /*
469
+ * pooling mode
470
+ */
471
+ typedef enum {
472
+ CUDNN_POOLING_MAX = 0,
473
+ CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = 1, /* count for average includes padded values */
474
+ CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = 2, /* count for average does not include padded values */
475
+ CUDNN_POOLING_MAX_DETERMINISTIC = 3
476
+ } cudnnPoolingMode_t CUDNN_DEPRECATED;
477
+
478
+ /* Create an instance of pooling descriptor */
479
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
480
+ cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc);
481
+
482
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
483
+ cudnnSetPooling2dDescriptor(cudnnPoolingDescriptor_t poolingDesc,
484
+ cudnnPoolingMode_t mode,
485
+ cudnnNanPropagation_t maxpoolingNanOpt,
486
+ int windowHeight,
487
+ int windowWidth,
488
+ int verticalPadding,
489
+ int horizontalPadding,
490
+ int verticalStride,
491
+ int horizontalStride);
492
+
493
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
494
+ cudnnGetPooling2dDescriptor(const cudnnPoolingDescriptor_t poolingDesc,
495
+ cudnnPoolingMode_t *mode,
496
+ cudnnNanPropagation_t *maxpoolingNanOpt,
497
+ int *windowHeight,
498
+ int *windowWidth,
499
+ int *verticalPadding,
500
+ int *horizontalPadding,
501
+ int *verticalStride,
502
+ int *horizontalStride);
503
+
504
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
505
+ cudnnSetPoolingNdDescriptor(cudnnPoolingDescriptor_t poolingDesc,
506
+ const cudnnPoolingMode_t mode,
507
+ const cudnnNanPropagation_t maxpoolingNanOpt,
508
+ int nbDims,
509
+ const int windowDimA[],
510
+ const int paddingA[],
511
+ const int strideA[]);
512
+
513
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
514
+ cudnnGetPoolingNdDescriptor(const cudnnPoolingDescriptor_t poolingDesc,
515
+ int nbDimsRequested,
516
+ cudnnPoolingMode_t *mode,
517
+ cudnnNanPropagation_t *maxpoolingNanOpt,
518
+ int *nbDims,
519
+ int windowDimA[],
520
+ int paddingA[],
521
+ int strideA[]);
522
+
523
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
524
+ cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
525
+ const cudnnTensorDescriptor_t inputTensorDesc,
526
+ int nbDims,
527
+ int outputTensorDimA[]);
528
+
529
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
530
+ cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
531
+ const cudnnTensorDescriptor_t inputTensorDesc,
532
+ int *n,
533
+ int *c,
534
+ int *h,
535
+ int *w);
536
+
537
+ /* Destroy an instance of pooling descriptor */
538
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
539
+ cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc);
540
+
541
+ /* Pooling functions: All of the form "output = alpha * Op(inputs) + beta * output" */
542
+
543
+ /* Function to perform forward pooling */
544
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
545
+ cudnnPoolingForward(cudnnHandle_t handle,
546
+ const cudnnPoolingDescriptor_t poolingDesc,
547
+ const void *alpha,
548
+ const cudnnTensorDescriptor_t xDesc,
549
+ const void *x,
550
+ const void *beta,
551
+ const cudnnTensorDescriptor_t yDesc,
552
+ void *y);
553
+
554
+ /* Activation functions: All of the form "output = alpha * Op(inputs) + beta * output" */
555
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
556
+ cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t *activationDesc);
557
+
558
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
559
+ cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activationDesc,
560
+ cudnnActivationMode_t mode,
561
+ cudnnNanPropagation_t reluNanOpt,
562
+ double coef); /* ceiling for clipped RELU, alpha for ELU */
563
+
564
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
565
+ cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc,
566
+ cudnnActivationMode_t *mode,
567
+ cudnnNanPropagation_t *reluNanOpt,
568
+ double *coef); /* ceiling for clipped RELU, alpha for ELU */
569
+
570
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
571
+ cudnnSetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double swish_beta);
572
+
573
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
574
+ cudnnGetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double *swish_beta);
575
+
576
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
577
+ cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc);
578
+
579
+ /* Function to perform forward activation */
580
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
581
+ cudnnActivationForward(cudnnHandle_t handle,
582
+ cudnnActivationDescriptor_t activationDesc,
583
+ const void *alpha,
584
+ const cudnnTensorDescriptor_t xDesc,
585
+ const void *x,
586
+ const void *beta,
587
+ const cudnnTensorDescriptor_t yDesc,
588
+ void *y);
589
+
590
+ /*
591
+ * Create an instance of LRN (Local Response Normalization) descriptor
592
+ * Uses lrnN=5, lrnAlpha=1e-4, lrnBeta=0.75, lrnK=2.0 as defaults from Krizhevsky'12 ImageNet paper
593
+ */
594
+ cudnnStatus_t CUDNNWINAPI
595
+ cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t *normDesc);
596
+
597
+ #define CUDNN_LRN_MIN_N 1 /* minimum allowed lrnN */
598
+ #define CUDNN_LRN_MAX_N 16 /* maximum allowed lrnN */
599
+ #define CUDNN_LRN_MIN_K 1e-5 /* minimum allowed lrnK */
600
+ #define CUDNN_LRN_MIN_BETA 0.01 /* minimum allowed lrnBeta */
601
+
602
+ /* LRN layer mode */
603
+ typedef enum {
604
+ CUDNN_LRN_CROSS_CHANNEL_DIM1 = 0, /* Normalize across tensor's dimA[1] dimension */
605
+ } cudnnLRNMode_t;
606
+
607
+ /*
608
+ * Uses a window [center-lookBehind, center+lookAhead], where
609
+ * lookBehind = floor( (lrnN-1)/2 ), lookAhead = lrnN-lookBehind-1.
610
+ * Values of double parameters cast to tensor data type.
611
+ */
612
+ cudnnStatus_t CUDNNWINAPI
613
+ cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned lrnN, double lrnAlpha, double lrnBeta, double lrnK);
614
+ /*
615
+ * Retrieve the settings currently stored in an LRN layer descriptor
616
+ * Any of the provided pointers can be NULL (no corresponding value will be returned)
617
+ */
618
+ cudnnStatus_t CUDNNWINAPI
619
+ cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned *lrnN, double *lrnAlpha, double *lrnBeta, double *lrnK);
620
+
621
+ /* Destroy an instance of LRN descriptor */
622
+ cudnnStatus_t CUDNNWINAPI
623
+ cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc);
624
+
625
+ /* LRN functions: output = alpha * normalize(x) + beta * old_y */
626
+
627
+ /* LRN cross-channel forward computation. Double parameters cast to tensor data type */
628
+ cudnnStatus_t CUDNNWINAPI
629
+ cudnnLRNCrossChannelForward(cudnnHandle_t handle,
630
+ cudnnLRNDescriptor_t normDesc,
631
+ cudnnLRNMode_t lrnMode,
632
+ const void *alpha,
633
+ const cudnnTensorDescriptor_t xDesc,
634
+ const void *x,
635
+ const void *beta,
636
+ const cudnnTensorDescriptor_t yDesc,
637
+ void *y);
638
+
639
+ typedef enum {
640
+ CUDNN_DIVNORM_PRECOMPUTED_MEANS = 0,
641
+ } cudnnDivNormMode_t;
642
+
643
+ /* LCN/divisive normalization functions: y = alpha * normalize(x) + beta * y */
644
+ cudnnStatus_t CUDNNWINAPI
645
+ cudnnDivisiveNormalizationForward(cudnnHandle_t handle,
646
+ cudnnLRNDescriptor_t normDesc,
647
+ cudnnDivNormMode_t mode,
648
+ const void *alpha,
649
+ const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */
650
+ const void *x,
651
+ const void *means, /* if NULL, means are assumed to be zero */
652
+ void *temp,
653
+ void *temp2,
654
+ const void *beta,
655
+ const cudnnTensorDescriptor_t yDesc,
656
+ void *y);
657
+
658
+ typedef enum {
659
+ /* bnScale, bnBias tensor dims are 1xCxHxWx.. (one value per CHW...-slice, normalized over N slice) */
660
+ CUDNN_BATCHNORM_PER_ACTIVATION = 0,
661
+
662
+ /* bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) */
663
+ CUDNN_BATCHNORM_SPATIAL = 1,
664
+
665
+ /*
666
+ * bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors).
667
+ * May be faster than CUDNN_BATCHNORM_SPATIAL but imposes some limits on the range of values
668
+ */
669
+ CUDNN_BATCHNORM_SPATIAL_PERSISTENT = 2,
670
+ } cudnnBatchNormMode_t CUDNN_DEPRECATED;
671
+
672
+ #define CUDNN_BN_MIN_EPSILON 0.0 /* Minimum epsilon allowed to be used in the Batch Normalization formula */
673
+
674
+ /*
675
+ * Derives a tensor descriptor from layer data descriptor for BatchNormalization
676
+ * scale, invVariance, bnBias, bnScale tensors. Use this tensor desc for
677
+ * bnScaleBiasMeanVarDesc and bnScaleBiasDiffDesc in Batch Normalization forward and backward functions.
678
+ */
679
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
680
+ cudnnDeriveBNTensorDescriptor(cudnnTensorDescriptor_t derivedBnDesc,
681
+ const cudnnTensorDescriptor_t xDesc,
682
+ cudnnBatchNormMode_t mode);
683
+
684
+ typedef enum {
685
+ CUDNN_BATCHNORM_OPS_BN = 0, /* do batch normalization only */
686
+ CUDNN_BATCHNORM_OPS_BN_ACTIVATION = 1, /* do batchNorm, then activation */
687
+ CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION = 2, /* do batchNorm, then elemWiseAdd, then activation */
688
+ } cudnnBatchNormOps_t CUDNN_DEPRECATED;
689
+
690
+ /*
691
+ * Performs Batch Normalization during Inference:
692
+ * y[i] = bnScale[k]*(x[i]-estimatedMean[k])/sqrt(epsilon+estimatedVariance[k]) + bnBias[k]
693
+ * with bnScale, bnBias, runningMean, runningInvVariance tensors indexed
694
+ * according to spatial or per-activation mode. Refer to cudnnBatchNormalizationForwardTraining
695
+ * above for notes on function arguments.
696
+ */
697
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
698
+ cudnnBatchNormalizationForwardInference(cudnnHandle_t handle,
699
+ cudnnBatchNormMode_t mode,
700
+ const void *alpha, /* alpha[0] = result blend factor */
701
+ const void *beta, /* beta[0] = dest layer blend factor */
702
+ const cudnnTensorDescriptor_t xDesc,
703
+ const void *x, /* NxCxHxW */
704
+ const cudnnTensorDescriptor_t yDesc,
705
+ void *y, /* NxCxHxW */
706
+ const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
707
+ const void *bnScale,
708
+ const void *bnBias,
709
+ const void *estimatedMean,
710
+ const void *estimatedVariance,
711
+ double epsilon);
712
+
713
+ typedef enum {
714
+ /* bnScale, bnBias tensor dims are 1xCxHxWx.. (one value per CHW...-slice, normalized over N slice) */
715
+ CUDNN_NORM_PER_ACTIVATION = 0,
716
+
717
+ /* bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) */
718
+ CUDNN_NORM_PER_CHANNEL = 1,
719
+ } cudnnNormMode_t CUDNN_DEPRECATED;
720
+
721
+ typedef enum { CUDNN_NORM_ALGO_STANDARD = 0, CUDNN_NORM_ALGO_PERSIST = 1 } cudnnNormAlgo_t CUDNN_DEPRECATED;
722
+
723
+ /*
724
+ * Derives a tensor descriptor from layer data descriptor for Normalization
725
+ * scale, invVariance, bnBias, bnScale tensors. Use this tensor desc for
726
+ * normScaleBiasMeanVarDesc and normScaleBiasDiffDesc in Normalization forward and backward functions.
727
+ */
728
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
729
+ cudnnDeriveNormTensorDescriptor(cudnnTensorDescriptor_t derivedNormScaleBiasDesc,
730
+ cudnnTensorDescriptor_t derivedNormMeanVarDesc,
731
+ const cudnnTensorDescriptor_t xDesc,
732
+ cudnnNormMode_t mode,
733
+ int groupCnt); /* Place hold for future work, should be set to 1 now*/
734
+
735
+ typedef enum {
736
+ CUDNN_NORM_OPS_NORM = 0, /* do normalization only */
737
+ CUDNN_NORM_OPS_NORM_ACTIVATION = 1, /* do Norm, then activation */
738
+ CUDNN_NORM_OPS_NORM_ADD_ACTIVATION = 2, /* do Norm, then elemWiseAdd, then activation */
739
+ } cudnnNormOps_t CUDNN_DEPRECATED;
740
+
741
+ /*
742
+ * Performs Normalization during Inference:
743
+ * y[i] = normScale[k]*(x[i]-estimatedMean[k])/sqrt(epsilon+estimatedVariance[k]) + normBias[k]
744
+ * with normScale, normBias, runningMean, runningInvVariance tensors indexed
745
+ * according to per-channel or per-activation mode. Refer to cudnnNormalizationForwardTraining
746
+ * above for notes on function arguments.
747
+ */
748
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
749
+ cudnnNormalizationForwardInference(cudnnHandle_t handle,
750
+ cudnnNormMode_t mode,
751
+ cudnnNormOps_t normOps,
752
+ cudnnNormAlgo_t algo,
753
+ const void *alpha, /* alpha[0] = result blend factor */
754
+ const void *beta, /* beta[0] = dest layer blend factor */
755
+ const cudnnTensorDescriptor_t xDesc,
756
+ const void *x, /* NxCxHxW */
757
+ const cudnnTensorDescriptor_t normScaleBiasDesc,
758
+ const void *normScale,
759
+ const void *normBias,
760
+ const cudnnTensorDescriptor_t normMeanVarDesc,
761
+ const void *estimatedMean,
762
+ const void *estimatedVariance,
763
+ const cudnnTensorDescriptor_t zDesc,
764
+ const void *z,
765
+ cudnnActivationDescriptor_t activationDesc,
766
+ const cudnnTensorDescriptor_t yDesc,
767
+ void *y, /* NxCxHxW */
768
+ double epsilon,
769
+ int groupCnt); /* Place hold for future work*/
770
+
771
+ /* APIs for spatial transformer network*/
772
+ typedef enum {
773
+ CUDNN_SAMPLER_BILINEAR = 0,
774
+ } cudnnSamplerType_t;
775
+
776
+ cudnnStatus_t CUDNNWINAPI
777
+ cudnnCreateSpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t *stDesc);
778
+
779
+ cudnnStatus_t CUDNNWINAPI
780
+ cudnnSetSpatialTransformerNdDescriptor(cudnnSpatialTransformerDescriptor_t stDesc,
781
+ cudnnSamplerType_t samplerType,
782
+ cudnnDataType_t dataType,
783
+ const int nbDims,
784
+ const int dimA[]);
785
+
786
+ cudnnStatus_t CUDNNWINAPI
787
+ cudnnDestroySpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t stDesc);
788
+
789
+ cudnnStatus_t CUDNNWINAPI
790
+ cudnnSpatialTfGridGeneratorForward(cudnnHandle_t handle,
791
+ const cudnnSpatialTransformerDescriptor_t stDesc,
792
+ const void *theta,
793
+ void *grid);
794
+
795
+ cudnnStatus_t CUDNNWINAPI
796
+ cudnnSpatialTfSamplerForward(cudnnHandle_t handle,
797
+ cudnnSpatialTransformerDescriptor_t stDesc,
798
+ const void *alpha,
799
+ const cudnnTensorDescriptor_t xDesc,
800
+ const void *x,
801
+ const void *grid,
802
+ const void *beta,
803
+ cudnnTensorDescriptor_t yDesc,
804
+ void *y);
805
+
806
+ typedef struct cudnnDropoutStruct *cudnnDropoutDescriptor_t;
807
+
808
+ cudnnStatus_t CUDNNWINAPI
809
+ cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t *dropoutDesc);
810
+
811
+ cudnnStatus_t CUDNNWINAPI
812
+ cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc);
813
+
814
+ /*helper function to determine size of the states to be passed to cudnnSetDropoutDescriptor */
815
+ cudnnStatus_t CUDNNWINAPI
816
+ cudnnDropoutGetStatesSize(cudnnHandle_t handle, size_t *sizeInBytes);
817
+
818
+ /*helper function to determine size of the reserve space to be passed to dropout forward/backward calls */
819
+ cudnnStatus_t CUDNNWINAPI
820
+ cudnnDropoutGetReserveSpaceSize(cudnnTensorDescriptor_t xdesc, size_t *sizeInBytes);
821
+
822
+ cudnnStatus_t CUDNNWINAPI
823
+ cudnnSetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
824
+ cudnnHandle_t handle,
825
+ float dropout,
826
+ void *states,
827
+ size_t stateSizeInBytes,
828
+ unsigned long long seed);
829
+
830
+ /* Restores the dropout descriptor to a previously saved-off state */
831
+ cudnnStatus_t CUDNNWINAPI
832
+ cudnnRestoreDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
833
+ cudnnHandle_t handle,
834
+ float dropout,
835
+ void *states,
836
+ size_t stateSizeInBytes,
837
+ unsigned long long seed);
838
+
839
+ cudnnStatus_t CUDNNWINAPI
840
+ cudnnGetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
841
+ cudnnHandle_t handle,
842
+ float *dropout,
843
+ void **states,
844
+ unsigned long long *seed);
845
+
846
+ cudnnStatus_t CUDNNWINAPI
847
+ cudnnDropoutForward(cudnnHandle_t handle,
848
+ const cudnnDropoutDescriptor_t dropoutDesc,
849
+ const cudnnTensorDescriptor_t xdesc,
850
+ const void *x,
851
+ const cudnnTensorDescriptor_t ydesc,
852
+ void *y,
853
+ void *reserveSpace,
854
+ size_t reserveSpaceSizeInBytes);
855
+
856
+ /* TODO: move these enums out to the appropriate submodule */
857
+ typedef enum {
858
+ CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM = 0,
859
+ CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM = 1,
860
+ CUDNN_CONVOLUTION_FWD_ALGO_GEMM = 2,
861
+ CUDNN_CONVOLUTION_FWD_ALGO_DIRECT = 3,
862
+ CUDNN_CONVOLUTION_FWD_ALGO_FFT = 4,
863
+ CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING = 5,
864
+ CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD = 6,
865
+ CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED = 7,
866
+ CUDNN_CONVOLUTION_FWD_ALGO_COUNT = 8
867
+ } cudnnConvolutionFwdAlgo_t;
868
+
869
+ typedef enum {
870
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 = 0, /* non-deterministic */
871
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 = 1,
872
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT = 2,
873
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3 = 3, /* non-deterministic */
874
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD = 4, /* not implemented */
875
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED = 5,
876
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING = 6,
877
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT = 7
878
+ } cudnnConvolutionBwdFilterAlgo_t;
879
+
880
+ typedef enum {
881
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 = 0, /* non-deterministic */
882
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 = 1,
883
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT = 2,
884
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING = 3,
885
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD = 4,
886
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED = 5,
887
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT = 6
888
+ } cudnnConvolutionBwdDataAlgo_t;
889
+
890
+ typedef enum { CUDNN_CTC_LOSS_ALGO_DETERMINISTIC = 0, CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC = 1 } cudnnCTCLossAlgo_t;
891
+
892
+ /*
893
+ * \brief Cross-library version checker.
894
+ * This function is implemented differently in each sub-library. Each sublib
895
+ * checks whether its own version matches that of its dependencies.
896
+ * \returns CUDNN_STATUS_SUCCESS if the version check passes,
897
+ * CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH if the versions are inconsistent.
898
+ */
899
+ cudnnStatus_t CUDNNWINAPI
900
+ cudnnOpsVersionCheck(void);
901
+
902
+ /* Function to perform backward softmax */
903
+ cudnnStatus_t CUDNNWINAPI
904
+ cudnnSoftmaxBackward(cudnnHandle_t handle,
905
+ cudnnSoftmaxAlgorithm_t algo,
906
+ cudnnSoftmaxMode_t mode,
907
+ const void *alpha,
908
+ const cudnnTensorDescriptor_t yDesc,
909
+ const void *y,
910
+ const cudnnTensorDescriptor_t dyDesc,
911
+ const void *dy,
912
+ const void *beta,
913
+ const cudnnTensorDescriptor_t dxDesc,
914
+ void *dx);
915
+
916
+ /* Function to perform backward pooling */
917
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
918
+ cudnnPoolingBackward(cudnnHandle_t handle,
919
+ const cudnnPoolingDescriptor_t poolingDesc,
920
+ const void *alpha,
921
+ const cudnnTensorDescriptor_t yDesc,
922
+ const void *y,
923
+ const cudnnTensorDescriptor_t dyDesc,
924
+ const void *dy,
925
+ const cudnnTensorDescriptor_t xDesc,
926
+ const void *x,
927
+ const void *beta,
928
+ const cudnnTensorDescriptor_t dxDesc,
929
+ void *dx);
930
+
931
+ /* Function to perform backward activation */
932
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
933
+ cudnnActivationBackward(cudnnHandle_t handle,
934
+ cudnnActivationDescriptor_t activationDesc,
935
+ const void *alpha,
936
+ const cudnnTensorDescriptor_t yDesc,
937
+ const void *y,
938
+ const cudnnTensorDescriptor_t dyDesc,
939
+ const void *dy,
940
+ const cudnnTensorDescriptor_t xDesc,
941
+ const void *x,
942
+ const void *beta,
943
+ const cudnnTensorDescriptor_t dxDesc,
944
+ void *dx);
945
+
946
+ /* LRN cross-channel backward computation. Double parameters cast to tensor data type */
947
+ cudnnStatus_t CUDNNWINAPI
948
+ cudnnLRNCrossChannelBackward(cudnnHandle_t handle,
949
+ cudnnLRNDescriptor_t normDesc,
950
+ cudnnLRNMode_t lrnMode,
951
+ const void *alpha,
952
+ const cudnnTensorDescriptor_t yDesc,
953
+ const void *y,
954
+ const cudnnTensorDescriptor_t dyDesc,
955
+ const void *dy,
956
+ const cudnnTensorDescriptor_t xDesc,
957
+ const void *x,
958
+ const void *beta,
959
+ const cudnnTensorDescriptor_t dxDesc,
960
+ void *dx);
961
+
962
+ cudnnStatus_t CUDNNWINAPI
963
+ cudnnDivisiveNormalizationBackward(cudnnHandle_t handle,
964
+ cudnnLRNDescriptor_t normDesc,
965
+ cudnnDivNormMode_t mode,
966
+ const void *alpha,
967
+ const cudnnTensorDescriptor_t xDesc, /* same desc for x, means, dy, temp, temp2 */
968
+ const void *x,
969
+ const void *means, /* if NULL, means are assumed to be zero */
970
+ const void *dy,
971
+ void *temp,
972
+ void *temp2,
973
+ const void *beta,
974
+ const cudnnTensorDescriptor_t dXdMeansDesc, /* same desc for dx, dMeans */
975
+ void *dx, /* output x differential */
976
+ void *dMeans); /* output means differential, can be NULL */
977
+
978
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
979
+ cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(cudnnHandle_t handle,
980
+ cudnnBatchNormMode_t mode,
981
+ cudnnBatchNormOps_t bnOps,
982
+ const cudnnTensorDescriptor_t xDesc,
983
+ const cudnnTensorDescriptor_t zDesc,
984
+ const cudnnTensorDescriptor_t yDesc,
985
+ const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
986
+ const cudnnActivationDescriptor_t activationDesc,
987
+ size_t *sizeInBytes);
988
+
989
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
990
+ cudnnGetBatchNormalizationBackwardExWorkspaceSize(cudnnHandle_t handle,
991
+ cudnnBatchNormMode_t mode,
992
+ cudnnBatchNormOps_t bnOps,
993
+ const cudnnTensorDescriptor_t xDesc,
994
+ const cudnnTensorDescriptor_t yDesc,
995
+ const cudnnTensorDescriptor_t dyDesc,
996
+ const cudnnTensorDescriptor_t dzDesc,
997
+ const cudnnTensorDescriptor_t dxDesc,
998
+ const cudnnTensorDescriptor_t dBnScaleBiasDesc,
999
+ const cudnnActivationDescriptor_t activationDesc,
1000
+ size_t *sizeInBytes);
1001
+
1002
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1003
+ cudnnGetBatchNormalizationTrainingExReserveSpaceSize(cudnnHandle_t handle,
1004
+ cudnnBatchNormMode_t mode,
1005
+ cudnnBatchNormOps_t bnOps,
1006
+ const cudnnActivationDescriptor_t activationDesc,
1007
+ const cudnnTensorDescriptor_t xDesc,
1008
+ size_t *sizeInBytes);
1009
+
1010
+ /* Computes y = BN(x). Also accumulates moving averages of mean and inverse variances */
1011
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1012
+ cudnnBatchNormalizationForwardTraining(
1013
+ cudnnHandle_t handle,
1014
+ cudnnBatchNormMode_t mode,
1015
+
1016
+ const void *alpha, /* alpha[0] = result blend factor */
1017
+ const void *beta, /* beta[0] = dest layer blend factor */
1018
+
1019
+ const cudnnTensorDescriptor_t xDesc,
1020
+ const void *x, /* NxCxHxW */
1021
+ const cudnnTensorDescriptor_t yDesc,
1022
+ void *y, /* NxCxHxW */
1023
+
1024
+ /* Shared desc for the next 6 tensors in the argument list.
1025
+ Data type to be set as follows:
1026
+ type = (typeOf(x) == double) ? double : float
1027
+ Dimensions for this descriptor depend on normalization mode
1028
+ - Spatial Normalization : tensors are expected to have dims 1xCx1x1
1029
+ (normalization is performed across NxHxW)
1030
+ - Per-Activation Normalization : tensors are expected to have dims of 1xCxHxW
1031
+ (normalization is performed across N) */
1032
+ const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
1033
+
1034
+ /* 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation */
1035
+ const void *bnScale,
1036
+ const void *bnBias,
1037
+
1038
+ /* MUST use factor=1 in the very first call of a complete training cycle.
1039
+ Use a factor=1/(1+n) at N-th call to the function to get
1040
+ Cumulative Moving Average (CMA) behavior
1041
+ CMA[n] = (x[1]+...+x[n])/n
1042
+ Since CMA[n+1] = (n*CMA[n]+x[n+1])/(n+1) =
1043
+ ((n+1)*CMA[n]-CMA[n])/(n+1) + x[n+1]/(n+1) =
1044
+ CMA[n]*(1-1/(n+1)) + x[n+1]*1/(n+1) */
1045
+ double exponentialAverageFactor,
1046
+
1047
+ /* Used in Training phase only.
1048
+ runningMean = newMean*factor + runningMean*(1-factor) */
1049
+ void *resultRunningMean,
1050
+ /* Output in training mode, input in inference. Is the moving average
1051
+ of variance[x] (factor is applied in the same way as for runningMean) */
1052
+ void *resultRunningVariance,
1053
+
1054
+ /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */
1055
+ double epsilon,
1056
+
1057
+ /* Optionally save intermediate results from the forward pass here
1058
+ - can be reused to speed up backward pass. NULL if unused */
1059
+ void *resultSaveMean,
1060
+ void *resultSaveInvVariance);
1061
+
1062
+ /* Computes y = relu(BN(x) + z). Also accumulates moving averages of mean and inverse variances */
1063
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1064
+ cudnnBatchNormalizationForwardTrainingEx(
1065
+ cudnnHandle_t handle,
1066
+ cudnnBatchNormMode_t mode,
1067
+ cudnnBatchNormOps_t bnOps,
1068
+
1069
+ const void *alpha, /* alpha[0] = result blend factor */
1070
+ const void *beta, /* beta[0] = dest layer blend factor */
1071
+
1072
+ const cudnnTensorDescriptor_t xDesc,
1073
+ const void *xData,
1074
+ const cudnnTensorDescriptor_t zDesc,
1075
+ const void *zData,
1076
+ const cudnnTensorDescriptor_t yDesc,
1077
+ void *yData,
1078
+
1079
+ const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
1080
+ const void *bnScale,
1081
+ const void *bnBias,
1082
+
1083
+ double exponentialAverageFactor,
1084
+ void *resultRunningMean,
1085
+ void *resultRunningVariance,
1086
+
1087
+ /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */
1088
+ double epsilon,
1089
+
1090
+ /* Optionally save intermediate results from the forward pass here
1091
+ - can be reused to speed up backward pass. NULL if unused */
1092
+ void *resultSaveMean,
1093
+ void *resultSaveInvVariance,
1094
+
1095
+ cudnnActivationDescriptor_t activationDesc,
1096
+ void *workspace,
1097
+ size_t workSpaceSizeInBytes,
1098
+ void *reserveSpace,
1099
+ size_t reserveSpaceSizeInBytes);
1100
+
1101
+ /* Performs backward pass of Batch Normalization layer. Returns x gradient,
1102
+ * bnScale gradient and bnBias gradient */
1103
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1104
+ cudnnBatchNormalizationBackward(cudnnHandle_t handle,
1105
+ cudnnBatchNormMode_t mode,
1106
+ const void *alphaDataDiff,
1107
+ const void *betaDataDiff,
1108
+ const void *alphaParamDiff,
1109
+ const void *betaParamDiff,
1110
+ const cudnnTensorDescriptor_t xDesc, /* same desc for x, dx, dy */
1111
+ const void *x,
1112
+ const cudnnTensorDescriptor_t dyDesc,
1113
+ const void *dy,
1114
+ const cudnnTensorDescriptor_t dxDesc,
1115
+ void *dx,
1116
+ /* Shared tensor desc for the 4 tensors below */
1117
+ const cudnnTensorDescriptor_t dBnScaleBiasDesc,
1118
+ const void *bnScale, /* bnBias doesn't affect backpropagation */
1119
+ /* scale and bias diff are not backpropagated below this layer */
1120
+ void *dBnScaleResult,
1121
+ void *dBnBiasResult,
1122
+ /* Same epsilon as forward pass */
1123
+ double epsilon,
1124
+
1125
+ /* Optionally cached intermediate results from
1126
+ forward pass */
1127
+ const void *savedMean,
1128
+ const void *savedInvVariance);
1129
+
1130
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1131
+ cudnnBatchNormalizationBackwardEx(cudnnHandle_t handle,
1132
+ cudnnBatchNormMode_t mode,
1133
+ cudnnBatchNormOps_t bnOps,
1134
+
1135
+ const void *alphaDataDiff,
1136
+ const void *betaDataDiff,
1137
+ const void *alphaParamDiff,
1138
+ const void *betaParamDiff,
1139
+ const cudnnTensorDescriptor_t xDesc,
1140
+ const void *xData,
1141
+ const cudnnTensorDescriptor_t yDesc,
1142
+ const void *yData,
1143
+ const cudnnTensorDescriptor_t dyDesc,
1144
+ const void *dyData,
1145
+ const cudnnTensorDescriptor_t dzDesc,
1146
+ void *dzData,
1147
+ const cudnnTensorDescriptor_t dxDesc,
1148
+ void *dxData,
1149
+
1150
+ /* Shared tensor desc for the 4 tensors below */
1151
+ const cudnnTensorDescriptor_t dBnScaleBiasDesc,
1152
+ const void *bnScaleData,
1153
+ const void *bnBiasData, /* needed if there is activation */
1154
+ void *dBnScaleData,
1155
+ void *dBnBiasData,
1156
+ double epsilon, /* Same epsilon as forward pass */
1157
+
1158
+ /* Optionally cached intermediate results from
1159
+ forward pass */
1160
+ const void *savedMean,
1161
+ const void *savedInvVariance,
1162
+ cudnnActivationDescriptor_t activationDesc,
1163
+ void *workSpace,
1164
+ size_t workSpaceSizeInBytes,
1165
+ void *reserveSpace,
1166
+ size_t reserveSpaceSizeInBytes);
1167
+
1168
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1169
+ cudnnGetNormalizationForwardTrainingWorkspaceSize(cudnnHandle_t handle,
1170
+ cudnnNormMode_t mode,
1171
+ cudnnNormOps_t normOps,
1172
+ cudnnNormAlgo_t algo,
1173
+ const cudnnTensorDescriptor_t xDesc,
1174
+ const cudnnTensorDescriptor_t zDesc,
1175
+ const cudnnTensorDescriptor_t yDesc,
1176
+ const cudnnTensorDescriptor_t normScaleBiasDesc,
1177
+ const cudnnActivationDescriptor_t activationDesc,
1178
+ const cudnnTensorDescriptor_t normMeanVarDesc,
1179
+ size_t *sizeInBytes,
1180
+ int groupCnt); /* Place hold for future work, should be set to 1 now*/
1181
+
1182
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1183
+ cudnnGetNormalizationBackwardWorkspaceSize(cudnnHandle_t handle,
1184
+ cudnnNormMode_t mode,
1185
+ cudnnNormOps_t normOps,
1186
+ cudnnNormAlgo_t algo,
1187
+ const cudnnTensorDescriptor_t xDesc,
1188
+ const cudnnTensorDescriptor_t yDesc,
1189
+ const cudnnTensorDescriptor_t dyDesc,
1190
+ const cudnnTensorDescriptor_t dzDesc,
1191
+ const cudnnTensorDescriptor_t dxDesc,
1192
+ const cudnnTensorDescriptor_t dNormScaleBiasDesc,
1193
+ const cudnnActivationDescriptor_t activationDesc,
1194
+ const cudnnTensorDescriptor_t normMeanVarDesc,
1195
+ size_t *sizeInBytes,
1196
+ int groupCnt); /* Place hold for future work, should be set to 1 now*/
1197
+
1198
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1199
+ cudnnGetNormalizationTrainingReserveSpaceSize(cudnnHandle_t handle,
1200
+ cudnnNormMode_t mode,
1201
+ cudnnNormOps_t normOps,
1202
+ cudnnNormAlgo_t algo,
1203
+ const cudnnActivationDescriptor_t activationDesc,
1204
+ const cudnnTensorDescriptor_t xDesc,
1205
+ size_t *sizeInBytes,
1206
+ int groupCnt); /* Place hold for future work, should be set to 1 now*/
1207
+
1208
+ /* Computes y = relu(Norm(x) + z). Also accumulates moving averages of mean and inverse variances */
1209
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1210
+ cudnnNormalizationForwardTraining(cudnnHandle_t handle,
1211
+ cudnnNormMode_t mode,
1212
+ cudnnNormOps_t normOps,
1213
+ cudnnNormAlgo_t algo,
1214
+ const void *alpha, /* alpha[0] = result blend factor */
1215
+ const void *beta, /* beta[0] = dest layer blend factor */
1216
+ const cudnnTensorDescriptor_t xDesc,
1217
+ const void *xData,
1218
+ const cudnnTensorDescriptor_t normScaleBiasDesc,
1219
+ const void *normScale,
1220
+ const void *normBias,
1221
+ double exponentialAverageFactor,
1222
+ const cudnnTensorDescriptor_t normMeanVarDesc,
1223
+ void *resultRunningMean,
1224
+ void *resultRunningVariance,
1225
+ /* Has to be >= 0. Should be the same in forward and backward functions. */
1226
+ double epsilon,
1227
+ /* Optionally save intermediate results from the forward pass here
1228
+ - can be reused to speed up backward pass. NULL if unused */
1229
+ void *resultSaveMean,
1230
+ void *resultSaveInvVariance,
1231
+ cudnnActivationDescriptor_t activationDesc,
1232
+ const cudnnTensorDescriptor_t zDesc,
1233
+ const void *zData,
1234
+ const cudnnTensorDescriptor_t yDesc,
1235
+ void *yData,
1236
+ void *workspace,
1237
+ size_t workSpaceSizeInBytes,
1238
+ void *reserveSpace,
1239
+ size_t reserveSpaceSizeInBytes,
1240
+ int groupCnt); /* Place hold for future work, should be set to 1 now*/
1241
+
1242
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1243
+ cudnnNormalizationBackward(cudnnHandle_t handle,
1244
+ cudnnNormMode_t mode,
1245
+ cudnnNormOps_t normOps,
1246
+ cudnnNormAlgo_t algo,
1247
+ const void *alphaDataDiff,
1248
+ const void *betaDataDiff,
1249
+ const void *alphaParamDiff,
1250
+ const void *betaParamDiff,
1251
+ const cudnnTensorDescriptor_t xDesc,
1252
+ const void *xData,
1253
+ const cudnnTensorDescriptor_t yDesc,
1254
+ const void *yData,
1255
+ const cudnnTensorDescriptor_t dyDesc,
1256
+ const void *dyData,
1257
+ const cudnnTensorDescriptor_t dzDesc,
1258
+ void *dzData,
1259
+ const cudnnTensorDescriptor_t dxDesc,
1260
+ void *dxData,
1261
+ /* Shared tensor desc for the 4 tensors below */
1262
+ const cudnnTensorDescriptor_t dNormScaleBiasDesc,
1263
+ const void *normScaleData,
1264
+ const void *normBiasData, /* needed if there is activation */
1265
+ void *dNormScaleData,
1266
+ void *dNormBiasData,
1267
+ double epsilon, /* Same epsilon as forward pass */
1268
+ const cudnnTensorDescriptor_t normMeanVarDesc,
1269
+ /* Optionally cached intermediate results from
1270
+ forward pass */
1271
+ const void *savedMean,
1272
+ const void *savedInvVariance,
1273
+ cudnnActivationDescriptor_t activationDesc,
1274
+ void *workSpace,
1275
+ size_t workSpaceSizeInBytes,
1276
+ void *reserveSpace,
1277
+ size_t reserveSpaceSizeInBytes,
1278
+ int groupCnt); /* Place hold for future work, should be set to 1 now*/
1279
+
1280
+ cudnnStatus_t CUDNNWINAPI
1281
+ cudnnSpatialTfGridGeneratorBackward(cudnnHandle_t handle,
1282
+ const cudnnSpatialTransformerDescriptor_t stDesc,
1283
+ const void *dgrid,
1284
+ void *dtheta);
1285
+
1286
+ cudnnStatus_t CUDNNWINAPI
1287
+ cudnnSpatialTfSamplerBackward(cudnnHandle_t handle,
1288
+ cudnnSpatialTransformerDescriptor_t stDesc,
1289
+ const void *alpha,
1290
+ const cudnnTensorDescriptor_t xDesc,
1291
+ const void *x,
1292
+ const void *beta,
1293
+ const cudnnTensorDescriptor_t dxDesc,
1294
+ void *dx,
1295
+ const void *alphaDgrid,
1296
+ const cudnnTensorDescriptor_t dyDesc,
1297
+ const void *dy,
1298
+ const void *grid,
1299
+ const void *betaDgrid,
1300
+ void *dgrid);
1301
+
1302
+ cudnnStatus_t CUDNNWINAPI
1303
+ cudnnDropoutBackward(cudnnHandle_t handle,
1304
+ const cudnnDropoutDescriptor_t dropoutDesc,
1305
+ const cudnnTensorDescriptor_t dydesc,
1306
+ const void *dy,
1307
+ const cudnnTensorDescriptor_t dxdesc,
1308
+ void *dx,
1309
+ void *reserveSpace,
1310
+ size_t reserveSpaceSizeInBytes);
1311
+
1312
+ #if defined(__cplusplus)
1313
+ }
1314
+ #endif
1315
+
1316
+ #endif /* CUDNN_OPS_H_ */
.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_ops_v9.h ADDED
@@ -0,0 +1,1316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2014-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /*
51
+ * cudnn_ops : cuDNN's basic definitions and basic operations.
52
+ */
53
+
54
+ #if !defined(CUDNN_OPS_H_)
55
+ #define CUDNN_OPS_H_
56
+
57
+ #include <stdint.h>
58
+
59
+ #include "cudnn_version.h"
60
+ #include "cudnn_graph.h"
61
+
62
+ /* These version numbers are autogenerated, do not edit manually. */
63
+ #define CUDNN_OPS_MAJOR 9
64
+ #define CUDNN_OPS_MINOR 10
65
+ #define CUDNN_OPS_PATCH 2
66
+
67
+ #if (CUDNN_OPS_MAJOR != CUDNN_MAJOR) || (CUDNN_OPS_MINOR != CUDNN_MINOR) || (CUDNN_OPS_PATCH != CUDNN_PATCHLEVEL)
68
+ #error Version mismatch in cuDNN OPS INFER!!!
69
+ #endif
70
+
71
+ #if defined(__cplusplus)
72
+ extern "C" {
73
+ #endif
74
+
75
+ /* Data structures to represent Image/Filter and the Neural Network Layer */
76
+ typedef struct cudnnTensorStruct *cudnnTensorDescriptor_t;
77
+ typedef struct cudnnPoolingStruct *cudnnPoolingDescriptor_t CUDNN_DEPRECATED;
78
+ typedef struct cudnnFilterStruct *cudnnFilterDescriptor_t CUDNN_DEPRECATED;
79
+ typedef struct cudnnLRNStruct *cudnnLRNDescriptor_t;
80
+ typedef struct cudnnActivationStruct *cudnnActivationDescriptor_t CUDNN_DEPRECATED;
81
+ typedef struct cudnnSpatialTransformerStruct *cudnnSpatialTransformerDescriptor_t;
82
+ typedef struct cudnnOpTensorStruct *cudnnOpTensorDescriptor_t CUDNN_DEPRECATED;
83
+ typedef struct cudnnReduceTensorStruct *cudnnReduceTensorDescriptor_t CUDNN_DEPRECATED;
84
+ typedef struct cudnnCTCLossStruct *cudnnCTCLossDescriptor_t;
85
+ typedef struct cudnnTensorTransformStruct *cudnnTensorTransformDescriptor_t CUDNN_DEPRECATED;
86
+ /*
87
+ * CUDNN Determinism
88
+ */
89
+ typedef enum {
90
+ CUDNN_NON_DETERMINISTIC = 0,
91
+ CUDNN_DETERMINISTIC = 1,
92
+ } cudnnDeterminism_t;
93
+
94
+ /* Create an instance of a generic Tensor descriptor */
95
+ cudnnStatus_t CUDNNWINAPI
96
+ cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc);
97
+
98
+ cudnnStatus_t CUDNNWINAPI
99
+ cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc,
100
+ cudnnTensorFormat_t format,
101
+ cudnnDataType_t dataType, /* image data type */
102
+ int n, /* number of inputs (batch size) */
103
+ int c, /* number of input feature maps */
104
+ int h, /* height of input section */
105
+ int w); /* width of input section */
106
+
107
+ cudnnStatus_t CUDNNWINAPI
108
+ cudnnSetTensor4dDescriptorEx(cudnnTensorDescriptor_t tensorDesc,
109
+ cudnnDataType_t dataType, /* image data type */
110
+ int n, /* number of inputs (batch size) */
111
+ int c, /* number of input feature maps */
112
+ int h, /* height of input section */
113
+ int w, /* width of input section */
114
+ int nStride,
115
+ int cStride,
116
+ int hStride,
117
+ int wStride);
118
+
119
+ cudnnStatus_t CUDNNWINAPI
120
+ cudnnGetTensor4dDescriptor(const cudnnTensorDescriptor_t tensorDesc,
121
+ cudnnDataType_t *dataType, /* image data type */
122
+ int *n, /* number of inputs (batch size) */
123
+ int *c, /* number of input feature maps */
124
+ int *h, /* height of input section */
125
+ int *w, /* width of input section */
126
+ int *nStride,
127
+ int *cStride,
128
+ int *hStride,
129
+ int *wStride);
130
+
131
+ cudnnStatus_t CUDNNWINAPI
132
+ cudnnSetTensorNdDescriptor(cudnnTensorDescriptor_t tensorDesc,
133
+ cudnnDataType_t dataType,
134
+ int nbDims,
135
+ const int dimA[],
136
+ const int strideA[]);
137
+
138
+ cudnnStatus_t CUDNNWINAPI
139
+ cudnnSetTensorNdDescriptorEx(cudnnTensorDescriptor_t tensorDesc,
140
+ cudnnTensorFormat_t format,
141
+ cudnnDataType_t dataType,
142
+ int nbDims,
143
+ const int dimA[]);
144
+
145
+ cudnnStatus_t CUDNNWINAPI
146
+ cudnnGetTensorNdDescriptor(const cudnnTensorDescriptor_t tensorDesc,
147
+ int nbDimsRequested,
148
+ cudnnDataType_t *dataType,
149
+ int *nbDims,
150
+ int dimA[],
151
+ int strideA[]);
152
+
153
+ cudnnStatus_t CUDNNWINAPI
154
+ cudnnGetTensorSizeInBytes(const cudnnTensorDescriptor_t tensorDesc, size_t *size);
155
+
156
+ /* PixelOffset( n, c, h, w ) = n *input_stride + c * feature_stride + h * h_stride + w * w_stride
157
+
158
+ 1)Example of all images in row major order one batch of features after the other (with an optional padding on row)
159
+ input_stride : c x h x h_stride
160
+ feature_stride : h x h_stride
161
+ h_stride : >= w ( h_stride = w if no padding)
162
+ w_stride : 1
163
+
164
+
165
+ 2)Example of all images in row major with features maps interleaved
166
+ input_stride : c x h x h_stride
167
+ feature_stride : 1
168
+ h_stride : w x c
169
+ w_stride : c
170
+
171
+ 3)Example of all images in column major order one batch of features after the other (with optional padding on column)
172
+ input_stride : c x w x w_stride
173
+ feature_stride : w x w_stride
174
+ h_stride : 1
175
+ w_stride : >= h
176
+
177
+ */
178
+
179
+ /* Destroy an instance of Tensor4d descriptor */
180
+ cudnnStatus_t CUDNNWINAPI
181
+ cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc);
182
+
183
+ /* Fold/unfold transforms */
184
+ typedef enum {
185
+ CUDNN_TRANSFORM_FOLD = 0U,
186
+ CUDNN_TRANSFORM_UNFOLD = 1U,
187
+ } cudnnFoldingDirection_t;
188
+
189
+ /** Create a destination descriptor for cudnnTransformTensor */
190
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
191
+ cudnnInitTransformDest(const cudnnTensorTransformDescriptor_t transformDesc,
192
+ const cudnnTensorDescriptor_t srcDesc,
193
+ cudnnTensorDescriptor_t destDesc,
194
+ size_t *destSizeInBytes);
195
+
196
+ /** Create an empty tensor transform descriptor */
197
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
198
+ cudnnCreateTensorTransformDescriptor(cudnnTensorTransformDescriptor_t *transformDesc);
199
+
200
+ /** Initialize a previously created tensor transform descriptor. */
201
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
202
+ cudnnSetTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc,
203
+ const uint32_t nbDims,
204
+ const cudnnTensorFormat_t destFormat,
205
+ const int32_t padBeforeA[],
206
+ const int32_t padAfterA[],
207
+ const uint32_t foldA[],
208
+ const cudnnFoldingDirection_t direction);
209
+
210
+ /**
211
+ * Retrieves the values stored in a previously initialized tensor transform
212
+ * descriptor.
213
+ */
214
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
215
+ cudnnGetTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc,
216
+ uint32_t nbDimsRequested,
217
+ cudnnTensorFormat_t *destFormat,
218
+ int32_t padBeforeA[],
219
+ int32_t padAfterA[],
220
+ uint32_t foldA[],
221
+ cudnnFoldingDirection_t *direction);
222
+
223
+ /**
224
+ * Destroys a previously created tensor transform descriptor.
225
+ */
226
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
227
+ cudnnDestroyTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc);
228
+
229
+ /* Tensor layout conversion helper (y = alpha * x + beta * y) */
230
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
231
+ cudnnTransformTensor(cudnnHandle_t handle,
232
+ const void *alpha,
233
+ const cudnnTensorDescriptor_t xDesc,
234
+ const void *x,
235
+ const void *beta,
236
+ const cudnnTensorDescriptor_t yDesc,
237
+ void *y);
238
+
239
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
240
+ cudnnTransformTensorEx(cudnnHandle_t handle,
241
+ const cudnnTensorTransformDescriptor_t transDesc,
242
+ const void *alpha,
243
+ const cudnnTensorDescriptor_t srcDesc,
244
+ const void *srcData,
245
+ const void *beta,
246
+ const cudnnTensorDescriptor_t destDesc,
247
+ void *destData);
248
+
249
+ /* Tensor Bias addition : C = alpha * A + beta * C */
250
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
251
+ cudnnAddTensor(cudnnHandle_t handle,
252
+ const void *alpha,
253
+ const cudnnTensorDescriptor_t aDesc,
254
+ const void *A,
255
+ const void *beta,
256
+ const cudnnTensorDescriptor_t cDesc,
257
+ void *C);
258
+
259
+ /*
260
+ * CUDNN OpTensor op type
261
+ */
262
+ typedef enum {
263
+ CUDNN_OP_TENSOR_ADD = 0,
264
+ CUDNN_OP_TENSOR_MUL = 1,
265
+ CUDNN_OP_TENSOR_MIN = 2,
266
+ CUDNN_OP_TENSOR_MAX = 3,
267
+ CUDNN_OP_TENSOR_SQRT = 4,
268
+ CUDNN_OP_TENSOR_NOT = 5,
269
+ } cudnnOpTensorOp_t;
270
+
271
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
272
+ cudnnCreateOpTensorDescriptor(cudnnOpTensorDescriptor_t *opTensorDesc);
273
+
274
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
275
+ cudnnSetOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc,
276
+ cudnnOpTensorOp_t opTensorOp,
277
+ cudnnDataType_t opTensorCompType,
278
+ cudnnNanPropagation_t opTensorNanOpt);
279
+
280
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
281
+ cudnnGetOpTensorDescriptor(const cudnnOpTensorDescriptor_t opTensorDesc,
282
+ cudnnOpTensorOp_t *opTensorOp,
283
+ cudnnDataType_t *opTensorCompType,
284
+ cudnnNanPropagation_t *opTensorNanOpt);
285
+
286
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
287
+ cudnnDestroyOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc);
288
+
289
+ /* Tensor operation : C = op( alpha1 * A, alpha2 * B ) + beta * C */
290
+ /* B tensor is ignored for CUDNN_OP_TENSOR_SQRT, CUDNN_OP_TENSOR_NOT. */
291
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
292
+ cudnnOpTensor(cudnnHandle_t handle,
293
+ const cudnnOpTensorDescriptor_t opTensorDesc,
294
+ const void *alpha1,
295
+ const cudnnTensorDescriptor_t aDesc,
296
+ const void *A,
297
+ const void *alpha2,
298
+ const cudnnTensorDescriptor_t bDesc,
299
+ const void *B,
300
+ const void *beta,
301
+ const cudnnTensorDescriptor_t cDesc,
302
+ void *C);
303
+
304
+ /*
305
+ * CUDNN ReduceTensor indices type
306
+ */
307
+ typedef enum {
308
+ CUDNN_REDUCE_TENSOR_NO_INDICES = 0,
309
+ CUDNN_REDUCE_TENSOR_FLATTENED_INDICES = 1,
310
+ } cudnnReduceTensorIndices_t CUDNN_DEPRECATED;
311
+
312
+ /*
313
+ * CUDNN tensor indices type size (all unsigned)
314
+ * Currently not supported, default is 32 bit unsigned.
315
+ */
316
+ typedef enum {
317
+ CUDNN_32BIT_INDICES = 0,
318
+ CUDNN_64BIT_INDICES = 1,
319
+ CUDNN_16BIT_INDICES = 2,
320
+ CUDNN_8BIT_INDICES = 3,
321
+ } cudnnIndicesType_t CUDNN_DEPRECATED;
322
+
323
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
324
+ cudnnCreateReduceTensorDescriptor(cudnnReduceTensorDescriptor_t *reduceTensorDesc);
325
+
326
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
327
+ cudnnSetReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc,
328
+ cudnnReduceTensorOp_t reduceTensorOp,
329
+ cudnnDataType_t reduceTensorCompType,
330
+ cudnnNanPropagation_t reduceTensorNanOpt,
331
+ cudnnReduceTensorIndices_t reduceTensorIndices,
332
+ cudnnIndicesType_t reduceTensorIndicesType);
333
+
334
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
335
+ cudnnGetReduceTensorDescriptor(const cudnnReduceTensorDescriptor_t reduceTensorDesc,
336
+ cudnnReduceTensorOp_t *reduceTensorOp,
337
+ cudnnDataType_t *reduceTensorCompType,
338
+ cudnnNanPropagation_t *reduceTensorNanOpt,
339
+ cudnnReduceTensorIndices_t *reduceTensorIndices,
340
+ cudnnIndicesType_t *reduceTensorIndicesType);
341
+
342
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
343
+ cudnnDestroyReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc);
344
+
345
+ /* Helper function to return the minimum size of the index space to be passed to the reduction given the input and
346
+ * output tensors */
347
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
348
+ cudnnGetReductionIndicesSize(cudnnHandle_t handle,
349
+ const cudnnReduceTensorDescriptor_t reduceTensorDesc,
350
+ const cudnnTensorDescriptor_t aDesc,
351
+ const cudnnTensorDescriptor_t cDesc,
352
+ size_t *sizeInBytes);
353
+
354
+ /* Helper function to return the minimum size of the workspace to be passed to the reduction given the input and output
355
+ * tensors */
356
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
357
+ cudnnGetReductionWorkspaceSize(cudnnHandle_t handle,
358
+ const cudnnReduceTensorDescriptor_t reduceTensorDesc,
359
+ const cudnnTensorDescriptor_t aDesc,
360
+ const cudnnTensorDescriptor_t cDesc,
361
+ size_t *sizeInBytes);
362
+
363
+ /* Tensor operation : C = reduce op( alpha * A ) + beta * C */
364
+ /* The NaN propagation enum applies to only the min and max reduce ops; the other reduce ops propagate NaN as usual. */
365
+ /* The indices space is ignored for reduce ops other than min or max. */
366
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
367
+ cudnnReduceTensor(cudnnHandle_t handle,
368
+ const cudnnReduceTensorDescriptor_t reduceTensorDesc,
369
+ void *indices,
370
+ size_t indicesSizeInBytes,
371
+ void *workspace,
372
+ size_t workspaceSizeInBytes,
373
+ const void *alpha,
374
+ const cudnnTensorDescriptor_t aDesc,
375
+ const void *A,
376
+ const void *beta,
377
+ const cudnnTensorDescriptor_t cDesc,
378
+ void *C);
379
+
380
+ /* Set all values of a tensor to a given value : y[i] = value[0] */
381
+ cudnnStatus_t CUDNNWINAPI
382
+ cudnnSetTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *valuePtr);
383
+
384
+ /* Scale all values of a tensor by a given factor : y[i] = alpha * y[i] */
385
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
386
+ cudnnScaleTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *alpha);
387
+
388
+ /* Create an instance of FilterStruct */
389
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
390
+ cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc);
391
+
392
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
393
+ cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc,
394
+ cudnnDataType_t dataType, /* image data type */
395
+ cudnnTensorFormat_t format,
396
+ int k, /* number of output feature maps */
397
+ int c, /* number of input feature maps */
398
+ int h, /* height of each input filter */
399
+ int w); /* width of each input filter */
400
+
401
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
402
+ cudnnGetFilter4dDescriptor(const cudnnFilterDescriptor_t filterDesc,
403
+ cudnnDataType_t *dataType, /* image data type */
404
+ cudnnTensorFormat_t *format,
405
+ int *k, /* number of output feature maps */
406
+ int *c, /* number of input feature maps */
407
+ int *h, /* height of each input filter */
408
+ int *w); /* width of each input filter */
409
+
410
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
411
+ cudnnSetFilterNdDescriptor(cudnnFilterDescriptor_t filterDesc,
412
+ cudnnDataType_t dataType, /* image data type */
413
+ cudnnTensorFormat_t format,
414
+ int nbDims,
415
+ const int filterDimA[]);
416
+
417
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
418
+ cudnnGetFilterNdDescriptor(const cudnnFilterDescriptor_t filterDesc,
419
+ int nbDimsRequested,
420
+ cudnnDataType_t *dataType, /* image data type */
421
+ cudnnTensorFormat_t *format,
422
+ int *nbDims,
423
+ int filterDimA[]);
424
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
425
+ cudnnGetFilterSizeInBytes(const cudnnFilterDescriptor_t filterDesc, size_t *size);
426
+
427
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
428
+ cudnnTransformFilter(cudnnHandle_t handle,
429
+ const cudnnTensorTransformDescriptor_t transDesc,
430
+ const void *alpha,
431
+ const cudnnFilterDescriptor_t srcDesc,
432
+ const void *srcData,
433
+ const void *beta,
434
+ const cudnnFilterDescriptor_t destDesc,
435
+ void *destData);
436
+
437
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
438
+ cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc);
439
+
440
+ /*
441
+ * softmax algorithm
442
+ */
443
+ typedef enum {
444
+ CUDNN_SOFTMAX_FAST = 0, /* straightforward implementation */
445
+ CUDNN_SOFTMAX_ACCURATE = 1, /* subtract max from every point to avoid overflow */
446
+ CUDNN_SOFTMAX_LOG = 2
447
+ } cudnnSoftmaxAlgorithm_t;
448
+
449
+ typedef enum {
450
+ CUDNN_SOFTMAX_MODE_INSTANCE = 0, /* compute the softmax over all C, H, W for each N */
451
+ CUDNN_SOFTMAX_MODE_CHANNEL = 1 /* compute the softmax over all C for each H, W, N */
452
+ } cudnnSoftmaxMode_t;
453
+
454
+ /* Softmax functions: All of the form "output = alpha * Op(inputs) + beta * output" */
455
+
456
+ /* Function to perform forward softmax */
457
+ cudnnStatus_t CUDNNWINAPI
458
+ cudnnSoftmaxForward(cudnnHandle_t handle,
459
+ cudnnSoftmaxAlgorithm_t algo,
460
+ cudnnSoftmaxMode_t mode,
461
+ const void *alpha,
462
+ const cudnnTensorDescriptor_t xDesc,
463
+ const void *x,
464
+ const void *beta,
465
+ const cudnnTensorDescriptor_t yDesc,
466
+ void *y);
467
+
468
+ /*
469
+ * pooling mode
470
+ */
471
+ typedef enum {
472
+ CUDNN_POOLING_MAX = 0,
473
+ CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = 1, /* count for average includes padded values */
474
+ CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = 2, /* count for average does not include padded values */
475
+ CUDNN_POOLING_MAX_DETERMINISTIC = 3
476
+ } cudnnPoolingMode_t CUDNN_DEPRECATED;
477
+
478
+ /* Create an instance of pooling descriptor */
479
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
480
+ cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc);
481
+
482
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
483
+ cudnnSetPooling2dDescriptor(cudnnPoolingDescriptor_t poolingDesc,
484
+ cudnnPoolingMode_t mode,
485
+ cudnnNanPropagation_t maxpoolingNanOpt,
486
+ int windowHeight,
487
+ int windowWidth,
488
+ int verticalPadding,
489
+ int horizontalPadding,
490
+ int verticalStride,
491
+ int horizontalStride);
492
+
493
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
494
+ cudnnGetPooling2dDescriptor(const cudnnPoolingDescriptor_t poolingDesc,
495
+ cudnnPoolingMode_t *mode,
496
+ cudnnNanPropagation_t *maxpoolingNanOpt,
497
+ int *windowHeight,
498
+ int *windowWidth,
499
+ int *verticalPadding,
500
+ int *horizontalPadding,
501
+ int *verticalStride,
502
+ int *horizontalStride);
503
+
504
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
505
+ cudnnSetPoolingNdDescriptor(cudnnPoolingDescriptor_t poolingDesc,
506
+ const cudnnPoolingMode_t mode,
507
+ const cudnnNanPropagation_t maxpoolingNanOpt,
508
+ int nbDims,
509
+ const int windowDimA[],
510
+ const int paddingA[],
511
+ const int strideA[]);
512
+
513
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
514
+ cudnnGetPoolingNdDescriptor(const cudnnPoolingDescriptor_t poolingDesc,
515
+ int nbDimsRequested,
516
+ cudnnPoolingMode_t *mode,
517
+ cudnnNanPropagation_t *maxpoolingNanOpt,
518
+ int *nbDims,
519
+ int windowDimA[],
520
+ int paddingA[],
521
+ int strideA[]);
522
+
523
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
524
+ cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
525
+ const cudnnTensorDescriptor_t inputTensorDesc,
526
+ int nbDims,
527
+ int outputTensorDimA[]);
528
+
529
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
530
+ cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
531
+ const cudnnTensorDescriptor_t inputTensorDesc,
532
+ int *n,
533
+ int *c,
534
+ int *h,
535
+ int *w);
536
+
537
+ /* Destroy an instance of pooling descriptor */
538
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
539
+ cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc);
540
+
541
+ /* Pooling functions: All of the form "output = alpha * Op(inputs) + beta * output" */
542
+
543
+ /* Function to perform forward pooling */
544
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
545
+ cudnnPoolingForward(cudnnHandle_t handle,
546
+ const cudnnPoolingDescriptor_t poolingDesc,
547
+ const void *alpha,
548
+ const cudnnTensorDescriptor_t xDesc,
549
+ const void *x,
550
+ const void *beta,
551
+ const cudnnTensorDescriptor_t yDesc,
552
+ void *y);
553
+
554
+ /* Activation functions: All of the form "output = alpha * Op(inputs) + beta * output" */
555
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
556
+ cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t *activationDesc);
557
+
558
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
559
+ cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activationDesc,
560
+ cudnnActivationMode_t mode,
561
+ cudnnNanPropagation_t reluNanOpt,
562
+ double coef); /* ceiling for clipped RELU, alpha for ELU */
563
+
564
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
565
+ cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc,
566
+ cudnnActivationMode_t *mode,
567
+ cudnnNanPropagation_t *reluNanOpt,
568
+ double *coef); /* ceiling for clipped RELU, alpha for ELU */
569
+
570
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
571
+ cudnnSetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double swish_beta);
572
+
573
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
574
+ cudnnGetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double *swish_beta);
575
+
576
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
577
+ cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc);
578
+
579
+ /* Function to perform forward activation */
580
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
581
+ cudnnActivationForward(cudnnHandle_t handle,
582
+ cudnnActivationDescriptor_t activationDesc,
583
+ const void *alpha,
584
+ const cudnnTensorDescriptor_t xDesc,
585
+ const void *x,
586
+ const void *beta,
587
+ const cudnnTensorDescriptor_t yDesc,
588
+ void *y);
589
+
590
+ /*
591
+ * Create an instance of LRN (Local Response Normalization) descriptor
592
+ * Uses lrnN=5, lrnAlpha=1e-4, lrnBeta=0.75, lrnK=2.0 as defaults from Krizhevsky'12 ImageNet paper
593
+ */
594
+ cudnnStatus_t CUDNNWINAPI
595
+ cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t *normDesc);
596
+
597
+ #define CUDNN_LRN_MIN_N 1 /* minimum allowed lrnN */
598
+ #define CUDNN_LRN_MAX_N 16 /* maximum allowed lrnN */
599
+ #define CUDNN_LRN_MIN_K 1e-5 /* minimum allowed lrnK */
600
+ #define CUDNN_LRN_MIN_BETA 0.01 /* minimum allowed lrnBeta */
601
+
602
+ /* LRN layer mode */
603
+ typedef enum {
604
+ CUDNN_LRN_CROSS_CHANNEL_DIM1 = 0, /* Normalize across tensor's dimA[1] dimension */
605
+ } cudnnLRNMode_t;
606
+
607
+ /*
608
+ * Uses a window [center-lookBehind, center+lookAhead], where
609
+ * lookBehind = floor( (lrnN-1)/2 ), lookAhead = lrnN-lookBehind-1.
610
+ * Values of double parameters cast to tensor data type.
611
+ */
612
+ cudnnStatus_t CUDNNWINAPI
613
+ cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned lrnN, double lrnAlpha, double lrnBeta, double lrnK);
614
+ /*
615
+ * Retrieve the settings currently stored in an LRN layer descriptor
616
+ * Any of the provided pointers can be NULL (no corresponding value will be returned)
617
+ */
618
+ cudnnStatus_t CUDNNWINAPI
619
+ cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned *lrnN, double *lrnAlpha, double *lrnBeta, double *lrnK);
620
+
621
+ /* Destroy an instance of LRN descriptor */
622
+ cudnnStatus_t CUDNNWINAPI
623
+ cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc);
624
+
625
+ /* LRN functions: output = alpha * normalize(x) + beta * old_y */
626
+
627
+ /* LRN cross-channel forward computation. Double parameters cast to tensor data type */
628
+ cudnnStatus_t CUDNNWINAPI
629
+ cudnnLRNCrossChannelForward(cudnnHandle_t handle,
630
+ cudnnLRNDescriptor_t normDesc,
631
+ cudnnLRNMode_t lrnMode,
632
+ const void *alpha,
633
+ const cudnnTensorDescriptor_t xDesc,
634
+ const void *x,
635
+ const void *beta,
636
+ const cudnnTensorDescriptor_t yDesc,
637
+ void *y);
638
+
639
+ typedef enum {
640
+ CUDNN_DIVNORM_PRECOMPUTED_MEANS = 0,
641
+ } cudnnDivNormMode_t;
642
+
643
+ /* LCN/divisive normalization functions: y = alpha * normalize(x) + beta * y */
644
+ cudnnStatus_t CUDNNWINAPI
645
+ cudnnDivisiveNormalizationForward(cudnnHandle_t handle,
646
+ cudnnLRNDescriptor_t normDesc,
647
+ cudnnDivNormMode_t mode,
648
+ const void *alpha,
649
+ const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */
650
+ const void *x,
651
+ const void *means, /* if NULL, means are assumed to be zero */
652
+ void *temp,
653
+ void *temp2,
654
+ const void *beta,
655
+ const cudnnTensorDescriptor_t yDesc,
656
+ void *y);
657
+
658
+ typedef enum {
659
+ /* bnScale, bnBias tensor dims are 1xCxHxWx.. (one value per CHW...-slice, normalized over N slice) */
660
+ CUDNN_BATCHNORM_PER_ACTIVATION = 0,
661
+
662
+ /* bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) */
663
+ CUDNN_BATCHNORM_SPATIAL = 1,
664
+
665
+ /*
666
+ * bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors).
667
+ * May be faster than CUDNN_BATCHNORM_SPATIAL but imposes some limits on the range of values
668
+ */
669
+ CUDNN_BATCHNORM_SPATIAL_PERSISTENT = 2,
670
+ } cudnnBatchNormMode_t CUDNN_DEPRECATED;
671
+
672
+ #define CUDNN_BN_MIN_EPSILON 0.0 /* Minimum epsilon allowed to be used in the Batch Normalization formula */
673
+
674
+ /*
675
+ * Derives a tensor descriptor from layer data descriptor for BatchNormalization
676
+ * scale, invVariance, bnBias, bnScale tensors. Use this tensor desc for
677
+ * bnScaleBiasMeanVarDesc and bnScaleBiasDiffDesc in Batch Normalization forward and backward functions.
678
+ */
679
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
680
+ cudnnDeriveBNTensorDescriptor(cudnnTensorDescriptor_t derivedBnDesc,
681
+ const cudnnTensorDescriptor_t xDesc,
682
+ cudnnBatchNormMode_t mode);
683
+
684
+ typedef enum {
685
+ CUDNN_BATCHNORM_OPS_BN = 0, /* do batch normalization only */
686
+ CUDNN_BATCHNORM_OPS_BN_ACTIVATION = 1, /* do batchNorm, then activation */
687
+ CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION = 2, /* do batchNorm, then elemWiseAdd, then activation */
688
+ } cudnnBatchNormOps_t CUDNN_DEPRECATED;
689
+
690
+ /*
691
+ * Performs Batch Normalization during Inference:
692
+ * y[i] = bnScale[k]*(x[i]-estimatedMean[k])/sqrt(epsilon+estimatedVariance[k]) + bnBias[k]
693
+ * with bnScale, bnBias, runningMean, runningInvVariance tensors indexed
694
+ * according to spatial or per-activation mode. Refer to cudnnBatchNormalizationForwardTraining
695
+ * above for notes on function arguments.
696
+ */
697
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
698
+ cudnnBatchNormalizationForwardInference(cudnnHandle_t handle,
699
+ cudnnBatchNormMode_t mode,
700
+ const void *alpha, /* alpha[0] = result blend factor */
701
+ const void *beta, /* beta[0] = dest layer blend factor */
702
+ const cudnnTensorDescriptor_t xDesc,
703
+ const void *x, /* NxCxHxW */
704
+ const cudnnTensorDescriptor_t yDesc,
705
+ void *y, /* NxCxHxW */
706
+ const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
707
+ const void *bnScale,
708
+ const void *bnBias,
709
+ const void *estimatedMean,
710
+ const void *estimatedVariance,
711
+ double epsilon);
712
+
713
+ typedef enum {
714
+ /* bnScale, bnBias tensor dims are 1xCxHxWx.. (one value per CHW...-slice, normalized over N slice) */
715
+ CUDNN_NORM_PER_ACTIVATION = 0,
716
+
717
+ /* bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) */
718
+ CUDNN_NORM_PER_CHANNEL = 1,
719
+ } cudnnNormMode_t CUDNN_DEPRECATED;
720
+
721
+ typedef enum { CUDNN_NORM_ALGO_STANDARD = 0, CUDNN_NORM_ALGO_PERSIST = 1 } cudnnNormAlgo_t CUDNN_DEPRECATED;
722
+
723
+ /*
724
+ * Derives a tensor descriptor from layer data descriptor for Normalization
725
+ * scale, invVariance, bnBias, bnScale tensors. Use this tensor desc for
726
+ * normScaleBiasMeanVarDesc and normScaleBiasDiffDesc in Normalization forward and backward functions.
727
+ */
728
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
729
+ cudnnDeriveNormTensorDescriptor(cudnnTensorDescriptor_t derivedNormScaleBiasDesc,
730
+ cudnnTensorDescriptor_t derivedNormMeanVarDesc,
731
+ const cudnnTensorDescriptor_t xDesc,
732
+ cudnnNormMode_t mode,
733
+ int groupCnt); /* Place hold for future work, should be set to 1 now*/
734
+
735
+ typedef enum {
736
+ CUDNN_NORM_OPS_NORM = 0, /* do normalization only */
737
+ CUDNN_NORM_OPS_NORM_ACTIVATION = 1, /* do Norm, then activation */
738
+ CUDNN_NORM_OPS_NORM_ADD_ACTIVATION = 2, /* do Norm, then elemWiseAdd, then activation */
739
+ } cudnnNormOps_t CUDNN_DEPRECATED;
740
+
741
+ /*
742
+ * Performs Normalization during Inference:
743
+ * y[i] = normScale[k]*(x[i]-estimatedMean[k])/sqrt(epsilon+estimatedVariance[k]) + normBias[k]
744
+ * with normScale, normBias, runningMean, runningInvVariance tensors indexed
745
+ * according to per-channel or per-activation mode. Refer to cudnnNormalizationForwardTraining
746
+ * above for notes on function arguments.
747
+ */
748
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
749
+ cudnnNormalizationForwardInference(cudnnHandle_t handle,
750
+ cudnnNormMode_t mode,
751
+ cudnnNormOps_t normOps,
752
+ cudnnNormAlgo_t algo,
753
+ const void *alpha, /* alpha[0] = result blend factor */
754
+ const void *beta, /* beta[0] = dest layer blend factor */
755
+ const cudnnTensorDescriptor_t xDesc,
756
+ const void *x, /* NxCxHxW */
757
+ const cudnnTensorDescriptor_t normScaleBiasDesc,
758
+ const void *normScale,
759
+ const void *normBias,
760
+ const cudnnTensorDescriptor_t normMeanVarDesc,
761
+ const void *estimatedMean,
762
+ const void *estimatedVariance,
763
+ const cudnnTensorDescriptor_t zDesc,
764
+ const void *z,
765
+ cudnnActivationDescriptor_t activationDesc,
766
+ const cudnnTensorDescriptor_t yDesc,
767
+ void *y, /* NxCxHxW */
768
+ double epsilon,
769
+ int groupCnt); /* Place hold for future work*/
770
+
771
+ /* APIs for spatial transformer network*/
772
+ typedef enum {
773
+ CUDNN_SAMPLER_BILINEAR = 0,
774
+ } cudnnSamplerType_t;
775
+
776
+ cudnnStatus_t CUDNNWINAPI
777
+ cudnnCreateSpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t *stDesc);
778
+
779
+ cudnnStatus_t CUDNNWINAPI
780
+ cudnnSetSpatialTransformerNdDescriptor(cudnnSpatialTransformerDescriptor_t stDesc,
781
+ cudnnSamplerType_t samplerType,
782
+ cudnnDataType_t dataType,
783
+ const int nbDims,
784
+ const int dimA[]);
785
+
786
+ cudnnStatus_t CUDNNWINAPI
787
+ cudnnDestroySpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t stDesc);
788
+
789
+ cudnnStatus_t CUDNNWINAPI
790
+ cudnnSpatialTfGridGeneratorForward(cudnnHandle_t handle,
791
+ const cudnnSpatialTransformerDescriptor_t stDesc,
792
+ const void *theta,
793
+ void *grid);
794
+
795
+ cudnnStatus_t CUDNNWINAPI
796
+ cudnnSpatialTfSamplerForward(cudnnHandle_t handle,
797
+ cudnnSpatialTransformerDescriptor_t stDesc,
798
+ const void *alpha,
799
+ const cudnnTensorDescriptor_t xDesc,
800
+ const void *x,
801
+ const void *grid,
802
+ const void *beta,
803
+ cudnnTensorDescriptor_t yDesc,
804
+ void *y);
805
+
806
+ typedef struct cudnnDropoutStruct *cudnnDropoutDescriptor_t;
807
+
808
+ cudnnStatus_t CUDNNWINAPI
809
+ cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t *dropoutDesc);
810
+
811
+ cudnnStatus_t CUDNNWINAPI
812
+ cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc);
813
+
814
+ /*helper function to determine size of the states to be passed to cudnnSetDropoutDescriptor */
815
+ cudnnStatus_t CUDNNWINAPI
816
+ cudnnDropoutGetStatesSize(cudnnHandle_t handle, size_t *sizeInBytes);
817
+
818
+ /*helper function to determine size of the reserve space to be passed to dropout forward/backward calls */
819
+ cudnnStatus_t CUDNNWINAPI
820
+ cudnnDropoutGetReserveSpaceSize(cudnnTensorDescriptor_t xdesc, size_t *sizeInBytes);
821
+
822
+ cudnnStatus_t CUDNNWINAPI
823
+ cudnnSetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
824
+ cudnnHandle_t handle,
825
+ float dropout,
826
+ void *states,
827
+ size_t stateSizeInBytes,
828
+ unsigned long long seed);
829
+
830
+ /* Restores the dropout descriptor to a previously saved-off state */
831
+ cudnnStatus_t CUDNNWINAPI
832
+ cudnnRestoreDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
833
+ cudnnHandle_t handle,
834
+ float dropout,
835
+ void *states,
836
+ size_t stateSizeInBytes,
837
+ unsigned long long seed);
838
+
839
+ cudnnStatus_t CUDNNWINAPI
840
+ cudnnGetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
841
+ cudnnHandle_t handle,
842
+ float *dropout,
843
+ void **states,
844
+ unsigned long long *seed);
845
+
846
+ cudnnStatus_t CUDNNWINAPI
847
+ cudnnDropoutForward(cudnnHandle_t handle,
848
+ const cudnnDropoutDescriptor_t dropoutDesc,
849
+ const cudnnTensorDescriptor_t xdesc,
850
+ const void *x,
851
+ const cudnnTensorDescriptor_t ydesc,
852
+ void *y,
853
+ void *reserveSpace,
854
+ size_t reserveSpaceSizeInBytes);
855
+
856
+ /* TODO: move these enums out to the appropriate submodule */
857
+ typedef enum {
858
+ CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM = 0,
859
+ CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM = 1,
860
+ CUDNN_CONVOLUTION_FWD_ALGO_GEMM = 2,
861
+ CUDNN_CONVOLUTION_FWD_ALGO_DIRECT = 3,
862
+ CUDNN_CONVOLUTION_FWD_ALGO_FFT = 4,
863
+ CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING = 5,
864
+ CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD = 6,
865
+ CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED = 7,
866
+ CUDNN_CONVOLUTION_FWD_ALGO_COUNT = 8
867
+ } cudnnConvolutionFwdAlgo_t;
868
+
869
+ typedef enum {
870
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 = 0, /* non-deterministic */
871
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 = 1,
872
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT = 2,
873
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3 = 3, /* non-deterministic */
874
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD = 4, /* not implemented */
875
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED = 5,
876
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING = 6,
877
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT = 7
878
+ } cudnnConvolutionBwdFilterAlgo_t;
879
+
880
+ typedef enum {
881
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 = 0, /* non-deterministic */
882
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 = 1,
883
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT = 2,
884
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING = 3,
885
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD = 4,
886
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED = 5,
887
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT = 6
888
+ } cudnnConvolutionBwdDataAlgo_t;
889
+
890
+ typedef enum { CUDNN_CTC_LOSS_ALGO_DETERMINISTIC = 0, CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC = 1 } cudnnCTCLossAlgo_t;
891
+
892
+ /*
893
+ * \brief Cross-library version checker.
894
+ * This function is implemented differently in each sub-library. Each sublib
895
+ * checks whether its own version matches that of its dependencies.
896
+ * \returns CUDNN_STATUS_SUCCESS if the version check passes,
897
+ * CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH if the versions are inconsistent.
898
+ */
899
+ cudnnStatus_t CUDNNWINAPI
900
+ cudnnOpsVersionCheck(void);
901
+
902
+ /* Function to perform backward softmax */
903
+ cudnnStatus_t CUDNNWINAPI
904
+ cudnnSoftmaxBackward(cudnnHandle_t handle,
905
+ cudnnSoftmaxAlgorithm_t algo,
906
+ cudnnSoftmaxMode_t mode,
907
+ const void *alpha,
908
+ const cudnnTensorDescriptor_t yDesc,
909
+ const void *y,
910
+ const cudnnTensorDescriptor_t dyDesc,
911
+ const void *dy,
912
+ const void *beta,
913
+ const cudnnTensorDescriptor_t dxDesc,
914
+ void *dx);
915
+
916
+ /* Function to perform backward pooling */
917
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
918
+ cudnnPoolingBackward(cudnnHandle_t handle,
919
+ const cudnnPoolingDescriptor_t poolingDesc,
920
+ const void *alpha,
921
+ const cudnnTensorDescriptor_t yDesc,
922
+ const void *y,
923
+ const cudnnTensorDescriptor_t dyDesc,
924
+ const void *dy,
925
+ const cudnnTensorDescriptor_t xDesc,
926
+ const void *x,
927
+ const void *beta,
928
+ const cudnnTensorDescriptor_t dxDesc,
929
+ void *dx);
930
+
931
+ /* Function to perform backward activation */
932
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
933
+ cudnnActivationBackward(cudnnHandle_t handle,
934
+ cudnnActivationDescriptor_t activationDesc,
935
+ const void *alpha,
936
+ const cudnnTensorDescriptor_t yDesc,
937
+ const void *y,
938
+ const cudnnTensorDescriptor_t dyDesc,
939
+ const void *dy,
940
+ const cudnnTensorDescriptor_t xDesc,
941
+ const void *x,
942
+ const void *beta,
943
+ const cudnnTensorDescriptor_t dxDesc,
944
+ void *dx);
945
+
946
+ /* LRN cross-channel backward computation. Double parameters cast to tensor data type */
947
+ cudnnStatus_t CUDNNWINAPI
948
+ cudnnLRNCrossChannelBackward(cudnnHandle_t handle,
949
+ cudnnLRNDescriptor_t normDesc,
950
+ cudnnLRNMode_t lrnMode,
951
+ const void *alpha,
952
+ const cudnnTensorDescriptor_t yDesc,
953
+ const void *y,
954
+ const cudnnTensorDescriptor_t dyDesc,
955
+ const void *dy,
956
+ const cudnnTensorDescriptor_t xDesc,
957
+ const void *x,
958
+ const void *beta,
959
+ const cudnnTensorDescriptor_t dxDesc,
960
+ void *dx);
961
+
962
+ cudnnStatus_t CUDNNWINAPI
963
+ cudnnDivisiveNormalizationBackward(cudnnHandle_t handle,
964
+ cudnnLRNDescriptor_t normDesc,
965
+ cudnnDivNormMode_t mode,
966
+ const void *alpha,
967
+ const cudnnTensorDescriptor_t xDesc, /* same desc for x, means, dy, temp, temp2 */
968
+ const void *x,
969
+ const void *means, /* if NULL, means are assumed to be zero */
970
+ const void *dy,
971
+ void *temp,
972
+ void *temp2,
973
+ const void *beta,
974
+ const cudnnTensorDescriptor_t dXdMeansDesc, /* same desc for dx, dMeans */
975
+ void *dx, /* output x differential */
976
+ void *dMeans); /* output means differential, can be NULL */
977
+
978
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
979
+ cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(cudnnHandle_t handle,
980
+ cudnnBatchNormMode_t mode,
981
+ cudnnBatchNormOps_t bnOps,
982
+ const cudnnTensorDescriptor_t xDesc,
983
+ const cudnnTensorDescriptor_t zDesc,
984
+ const cudnnTensorDescriptor_t yDesc,
985
+ const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
986
+ const cudnnActivationDescriptor_t activationDesc,
987
+ size_t *sizeInBytes);
988
+
989
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
990
+ cudnnGetBatchNormalizationBackwardExWorkspaceSize(cudnnHandle_t handle,
991
+ cudnnBatchNormMode_t mode,
992
+ cudnnBatchNormOps_t bnOps,
993
+ const cudnnTensorDescriptor_t xDesc,
994
+ const cudnnTensorDescriptor_t yDesc,
995
+ const cudnnTensorDescriptor_t dyDesc,
996
+ const cudnnTensorDescriptor_t dzDesc,
997
+ const cudnnTensorDescriptor_t dxDesc,
998
+ const cudnnTensorDescriptor_t dBnScaleBiasDesc,
999
+ const cudnnActivationDescriptor_t activationDesc,
1000
+ size_t *sizeInBytes);
1001
+
1002
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1003
+ cudnnGetBatchNormalizationTrainingExReserveSpaceSize(cudnnHandle_t handle,
1004
+ cudnnBatchNormMode_t mode,
1005
+ cudnnBatchNormOps_t bnOps,
1006
+ const cudnnActivationDescriptor_t activationDesc,
1007
+ const cudnnTensorDescriptor_t xDesc,
1008
+ size_t *sizeInBytes);
1009
+
1010
+ /* Computes y = BN(x). Also accumulates moving averages of mean and inverse variances */
1011
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1012
+ cudnnBatchNormalizationForwardTraining(
1013
+ cudnnHandle_t handle,
1014
+ cudnnBatchNormMode_t mode,
1015
+
1016
+ const void *alpha, /* alpha[0] = result blend factor */
1017
+ const void *beta, /* beta[0] = dest layer blend factor */
1018
+
1019
+ const cudnnTensorDescriptor_t xDesc,
1020
+ const void *x, /* NxCxHxW */
1021
+ const cudnnTensorDescriptor_t yDesc,
1022
+ void *y, /* NxCxHxW */
1023
+
1024
+ /* Shared desc for the next 6 tensors in the argument list.
1025
+ Data type to be set as follows:
1026
+ type = (typeOf(x) == double) ? double : float
1027
+ Dimensions for this descriptor depend on normalization mode
1028
+ - Spatial Normalization : tensors are expected to have dims 1xCx1x1
1029
+ (normalization is performed across NxHxW)
1030
+ - Per-Activation Normalization : tensors are expected to have dims of 1xCxHxW
1031
+ (normalization is performed across N) */
1032
+ const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
1033
+
1034
+ /* 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation */
1035
+ const void *bnScale,
1036
+ const void *bnBias,
1037
+
1038
+ /* MUST use factor=1 in the very first call of a complete training cycle.
1039
+ Use a factor=1/(1+n) at N-th call to the function to get
1040
+ Cumulative Moving Average (CMA) behavior
1041
+ CMA[n] = (x[1]+...+x[n])/n
1042
+ Since CMA[n+1] = (n*CMA[n]+x[n+1])/(n+1) =
1043
+ ((n+1)*CMA[n]-CMA[n])/(n+1) + x[n+1]/(n+1) =
1044
+ CMA[n]*(1-1/(n+1)) + x[n+1]*1/(n+1) */
1045
+ double exponentialAverageFactor,
1046
+
1047
+ /* Used in Training phase only.
1048
+ runningMean = newMean*factor + runningMean*(1-factor) */
1049
+ void *resultRunningMean,
1050
+ /* Output in training mode, input in inference. Is the moving average
1051
+ of variance[x] (factor is applied in the same way as for runningMean) */
1052
+ void *resultRunningVariance,
1053
+
1054
+ /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */
1055
+ double epsilon,
1056
+
1057
+ /* Optionally save intermediate results from the forward pass here
1058
+ - can be reused to speed up backward pass. NULL if unused */
1059
+ void *resultSaveMean,
1060
+ void *resultSaveInvVariance);
1061
+
1062
+ /* Computes y = relu(BN(x) + z). Also accumulates moving averages of mean and inverse variances */
1063
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1064
+ cudnnBatchNormalizationForwardTrainingEx(
1065
+ cudnnHandle_t handle,
1066
+ cudnnBatchNormMode_t mode,
1067
+ cudnnBatchNormOps_t bnOps,
1068
+
1069
+ const void *alpha, /* alpha[0] = result blend factor */
1070
+ const void *beta, /* beta[0] = dest layer blend factor */
1071
+
1072
+ const cudnnTensorDescriptor_t xDesc,
1073
+ const void *xData,
1074
+ const cudnnTensorDescriptor_t zDesc,
1075
+ const void *zData,
1076
+ const cudnnTensorDescriptor_t yDesc,
1077
+ void *yData,
1078
+
1079
+ const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
1080
+ const void *bnScale,
1081
+ const void *bnBias,
1082
+
1083
+ double exponentialAverageFactor,
1084
+ void *resultRunningMean,
1085
+ void *resultRunningVariance,
1086
+
1087
+ /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */
1088
+ double epsilon,
1089
+
1090
+ /* Optionally save intermediate results from the forward pass here
1091
+ - can be reused to speed up backward pass. NULL if unused */
1092
+ void *resultSaveMean,
1093
+ void *resultSaveInvVariance,
1094
+
1095
+ cudnnActivationDescriptor_t activationDesc,
1096
+ void *workspace,
1097
+ size_t workSpaceSizeInBytes,
1098
+ void *reserveSpace,
1099
+ size_t reserveSpaceSizeInBytes);
1100
+
1101
+ /* Performs backward pass of Batch Normalization layer. Returns x gradient,
1102
+ * bnScale gradient and bnBias gradient */
1103
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1104
+ cudnnBatchNormalizationBackward(cudnnHandle_t handle,
1105
+ cudnnBatchNormMode_t mode,
1106
+ const void *alphaDataDiff,
1107
+ const void *betaDataDiff,
1108
+ const void *alphaParamDiff,
1109
+ const void *betaParamDiff,
1110
+ const cudnnTensorDescriptor_t xDesc, /* same desc for x, dx, dy */
1111
+ const void *x,
1112
+ const cudnnTensorDescriptor_t dyDesc,
1113
+ const void *dy,
1114
+ const cudnnTensorDescriptor_t dxDesc,
1115
+ void *dx,
1116
+ /* Shared tensor desc for the 4 tensors below */
1117
+ const cudnnTensorDescriptor_t dBnScaleBiasDesc,
1118
+ const void *bnScale, /* bnBias doesn't affect backpropagation */
1119
+ /* scale and bias diff are not backpropagated below this layer */
1120
+ void *dBnScaleResult,
1121
+ void *dBnBiasResult,
1122
+ /* Same epsilon as forward pass */
1123
+ double epsilon,
1124
+
1125
+ /* Optionally cached intermediate results from
1126
+ forward pass */
1127
+ const void *savedMean,
1128
+ const void *savedInvVariance);
1129
+
1130
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1131
+ cudnnBatchNormalizationBackwardEx(cudnnHandle_t handle,
1132
+ cudnnBatchNormMode_t mode,
1133
+ cudnnBatchNormOps_t bnOps,
1134
+
1135
+ const void *alphaDataDiff,
1136
+ const void *betaDataDiff,
1137
+ const void *alphaParamDiff,
1138
+ const void *betaParamDiff,
1139
+ const cudnnTensorDescriptor_t xDesc,
1140
+ const void *xData,
1141
+ const cudnnTensorDescriptor_t yDesc,
1142
+ const void *yData,
1143
+ const cudnnTensorDescriptor_t dyDesc,
1144
+ const void *dyData,
1145
+ const cudnnTensorDescriptor_t dzDesc,
1146
+ void *dzData,
1147
+ const cudnnTensorDescriptor_t dxDesc,
1148
+ void *dxData,
1149
+
1150
+ /* Shared tensor desc for the 4 tensors below */
1151
+ const cudnnTensorDescriptor_t dBnScaleBiasDesc,
1152
+ const void *bnScaleData,
1153
+ const void *bnBiasData, /* needed if there is activation */
1154
+ void *dBnScaleData,
1155
+ void *dBnBiasData,
1156
+ double epsilon, /* Same epsilon as forward pass */
1157
+
1158
+ /* Optionally cached intermediate results from
1159
+ forward pass */
1160
+ const void *savedMean,
1161
+ const void *savedInvVariance,
1162
+ cudnnActivationDescriptor_t activationDesc,
1163
+ void *workSpace,
1164
+ size_t workSpaceSizeInBytes,
1165
+ void *reserveSpace,
1166
+ size_t reserveSpaceSizeInBytes);
1167
+
1168
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1169
+ cudnnGetNormalizationForwardTrainingWorkspaceSize(cudnnHandle_t handle,
1170
+ cudnnNormMode_t mode,
1171
+ cudnnNormOps_t normOps,
1172
+ cudnnNormAlgo_t algo,
1173
+ const cudnnTensorDescriptor_t xDesc,
1174
+ const cudnnTensorDescriptor_t zDesc,
1175
+ const cudnnTensorDescriptor_t yDesc,
1176
+ const cudnnTensorDescriptor_t normScaleBiasDesc,
1177
+ const cudnnActivationDescriptor_t activationDesc,
1178
+ const cudnnTensorDescriptor_t normMeanVarDesc,
1179
+ size_t *sizeInBytes,
1180
+ int groupCnt); /* Place hold for future work, should be set to 1 now*/
1181
+
1182
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1183
+ cudnnGetNormalizationBackwardWorkspaceSize(cudnnHandle_t handle,
1184
+ cudnnNormMode_t mode,
1185
+ cudnnNormOps_t normOps,
1186
+ cudnnNormAlgo_t algo,
1187
+ const cudnnTensorDescriptor_t xDesc,
1188
+ const cudnnTensorDescriptor_t yDesc,
1189
+ const cudnnTensorDescriptor_t dyDesc,
1190
+ const cudnnTensorDescriptor_t dzDesc,
1191
+ const cudnnTensorDescriptor_t dxDesc,
1192
+ const cudnnTensorDescriptor_t dNormScaleBiasDesc,
1193
+ const cudnnActivationDescriptor_t activationDesc,
1194
+ const cudnnTensorDescriptor_t normMeanVarDesc,
1195
+ size_t *sizeInBytes,
1196
+ int groupCnt); /* Place hold for future work, should be set to 1 now*/
1197
+
1198
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1199
+ cudnnGetNormalizationTrainingReserveSpaceSize(cudnnHandle_t handle,
1200
+ cudnnNormMode_t mode,
1201
+ cudnnNormOps_t normOps,
1202
+ cudnnNormAlgo_t algo,
1203
+ const cudnnActivationDescriptor_t activationDesc,
1204
+ const cudnnTensorDescriptor_t xDesc,
1205
+ size_t *sizeInBytes,
1206
+ int groupCnt); /* Place hold for future work, should be set to 1 now*/
1207
+
1208
+ /* Computes y = relu(Norm(x) + z). Also accumulates moving averages of mean and inverse variances */
1209
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1210
+ cudnnNormalizationForwardTraining(cudnnHandle_t handle,
1211
+ cudnnNormMode_t mode,
1212
+ cudnnNormOps_t normOps,
1213
+ cudnnNormAlgo_t algo,
1214
+ const void *alpha, /* alpha[0] = result blend factor */
1215
+ const void *beta, /* beta[0] = dest layer blend factor */
1216
+ const cudnnTensorDescriptor_t xDesc,
1217
+ const void *xData,
1218
+ const cudnnTensorDescriptor_t normScaleBiasDesc,
1219
+ const void *normScale,
1220
+ const void *normBias,
1221
+ double exponentialAverageFactor,
1222
+ const cudnnTensorDescriptor_t normMeanVarDesc,
1223
+ void *resultRunningMean,
1224
+ void *resultRunningVariance,
1225
+ /* Has to be >= 0. Should be the same in forward and backward functions. */
1226
+ double epsilon,
1227
+ /* Optionally save intermediate results from the forward pass here
1228
+ - can be reused to speed up backward pass. NULL if unused */
1229
+ void *resultSaveMean,
1230
+ void *resultSaveInvVariance,
1231
+ cudnnActivationDescriptor_t activationDesc,
1232
+ const cudnnTensorDescriptor_t zDesc,
1233
+ const void *zData,
1234
+ const cudnnTensorDescriptor_t yDesc,
1235
+ void *yData,
1236
+ void *workspace,
1237
+ size_t workSpaceSizeInBytes,
1238
+ void *reserveSpace,
1239
+ size_t reserveSpaceSizeInBytes,
1240
+ int groupCnt); /* Place hold for future work, should be set to 1 now*/
1241
+
1242
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1243
+ cudnnNormalizationBackward(cudnnHandle_t handle,
1244
+ cudnnNormMode_t mode,
1245
+ cudnnNormOps_t normOps,
1246
+ cudnnNormAlgo_t algo,
1247
+ const void *alphaDataDiff,
1248
+ const void *betaDataDiff,
1249
+ const void *alphaParamDiff,
1250
+ const void *betaParamDiff,
1251
+ const cudnnTensorDescriptor_t xDesc,
1252
+ const void *xData,
1253
+ const cudnnTensorDescriptor_t yDesc,
1254
+ const void *yData,
1255
+ const cudnnTensorDescriptor_t dyDesc,
1256
+ const void *dyData,
1257
+ const cudnnTensorDescriptor_t dzDesc,
1258
+ void *dzData,
1259
+ const cudnnTensorDescriptor_t dxDesc,
1260
+ void *dxData,
1261
+ /* Shared tensor desc for the 4 tensors below */
1262
+ const cudnnTensorDescriptor_t dNormScaleBiasDesc,
1263
+ const void *normScaleData,
1264
+ const void *normBiasData, /* needed if there is activation */
1265
+ void *dNormScaleData,
1266
+ void *dNormBiasData,
1267
+ double epsilon, /* Same epsilon as forward pass */
1268
+ const cudnnTensorDescriptor_t normMeanVarDesc,
1269
+ /* Optionally cached intermediate results from
1270
+ forward pass */
1271
+ const void *savedMean,
1272
+ const void *savedInvVariance,
1273
+ cudnnActivationDescriptor_t activationDesc,
1274
+ void *workSpace,
1275
+ size_t workSpaceSizeInBytes,
1276
+ void *reserveSpace,
1277
+ size_t reserveSpaceSizeInBytes,
1278
+ int groupCnt); /* Place hold for future work, should be set to 1 now*/
1279
+
1280
+ cudnnStatus_t CUDNNWINAPI
1281
+ cudnnSpatialTfGridGeneratorBackward(cudnnHandle_t handle,
1282
+ const cudnnSpatialTransformerDescriptor_t stDesc,
1283
+ const void *dgrid,
1284
+ void *dtheta);
1285
+
1286
+ cudnnStatus_t CUDNNWINAPI
1287
+ cudnnSpatialTfSamplerBackward(cudnnHandle_t handle,
1288
+ cudnnSpatialTransformerDescriptor_t stDesc,
1289
+ const void *alpha,
1290
+ const cudnnTensorDescriptor_t xDesc,
1291
+ const void *x,
1292
+ const void *beta,
1293
+ const cudnnTensorDescriptor_t dxDesc,
1294
+ void *dx,
1295
+ const void *alphaDgrid,
1296
+ const cudnnTensorDescriptor_t dyDesc,
1297
+ const void *dy,
1298
+ const void *grid,
1299
+ const void *betaDgrid,
1300
+ void *dgrid);
1301
+
1302
+ cudnnStatus_t CUDNNWINAPI
1303
+ cudnnDropoutBackward(cudnnHandle_t handle,
1304
+ const cudnnDropoutDescriptor_t dropoutDesc,
1305
+ const cudnnTensorDescriptor_t dydesc,
1306
+ const void *dy,
1307
+ const cudnnTensorDescriptor_t dxdesc,
1308
+ void *dx,
1309
+ void *reserveSpace,
1310
+ size_t reserveSpaceSizeInBytes);
1311
+
1312
+ #if defined(__cplusplus)
1313
+ }
1314
+ #endif
1315
+
1316
+ #endif /* CUDNN_OPS_H_ */
.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_v9.h ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2014-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /* cudnn : Neural Networks Library */
51
+
52
+ #if !defined(CUDNN_H_)
53
+ #define CUDNN_H_
54
+ #if defined(__cplusplus)
55
+ extern "C" {
56
+ #endif
57
+
58
+ #include <cuda_runtime_api.h>
59
+ #include "cudnn_version.h"
60
+ #include "cudnn_graph.h"
61
+ #include "cudnn_ops.h"
62
+ #include "cudnn_adv.h"
63
+ #include "cudnn_cnn.h"
64
+
65
+ #if defined(__cplusplus)
66
+ }
67
+ #endif
68
+ #endif /* CUDNN_H_ */
.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_version.h ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2014-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /**
51
+ * \file: The master cuDNN version file.
52
+ */
53
+
54
+ #ifndef CUDNN_VERSION_H_
55
+ #define CUDNN_VERSION_H_
56
+
57
+ #define CUDNN_MAJOR 9
58
+ #define CUDNN_MINOR 10
59
+ #define CUDNN_PATCHLEVEL 2
60
+
61
+ #define CUDNN_VERSION (CUDNN_MAJOR * 10000 + CUDNN_MINOR * 100 + CUDNN_PATCHLEVEL)
62
+
63
+ /* cannot use constexpr here since this is a C-only file */
64
+ /* Below is the max SM version this cuDNN library is aware of and supports natively */
65
+
66
+ #define CUDNN_MAX_SM_MAJOR_NUMBER 12
67
+ #define CUDNN_MAX_SM_MINOR_NUMBER 0
68
+ #define CUDNN_MAX_DEVICE_VERSION (CUDNN_MAX_SM_MAJOR_NUMBER * 100 + CUDNN_MAX_SM_MINOR_NUMBER * 10)
69
+
70
+ #endif /* CUDNN_VERSION_H */
.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_version_v9.h ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2014-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /**
51
+ * \file: The master cuDNN version file.
52
+ */
53
+
54
+ #ifndef CUDNN_VERSION_H_
55
+ #define CUDNN_VERSION_H_
56
+
57
+ #define CUDNN_MAJOR 9
58
+ #define CUDNN_MINOR 10
59
+ #define CUDNN_PATCHLEVEL 2
60
+
61
+ #define CUDNN_VERSION (CUDNN_MAJOR * 10000 + CUDNN_MINOR * 100 + CUDNN_PATCHLEVEL)
62
+
63
+ /* cannot use constexpr here since this is a C-only file */
64
+ /* Below is the max SM version this cuDNN library is aware of and supports natively */
65
+
66
+ #define CUDNN_MAX_SM_MAJOR_NUMBER 12
67
+ #define CUDNN_MAX_SM_MINOR_NUMBER 0
68
+ #define CUDNN_MAX_DEVICE_VERSION (CUDNN_MAX_SM_MAJOR_NUMBER * 100 + CUDNN_MAX_SM_MINOR_NUMBER * 10)
69
+
70
+ #endif /* CUDNN_VERSION_H */
.venv/lib/python3.12/site-packages/nvidia_nccl_cu12-2.27.3.dist-info/licenses/License.txt ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
3
+
4
+ Redistribution and use in source and binary forms, with or without
5
+ modification, are permitted provided that the following conditions
6
+ are met:
7
+ * Redistributions of source code must retain the above copyright
8
+ notice, this list of conditions and the following disclaimer.
9
+ * Redistributions in binary form must reproduce the above copyright
10
+ notice, this list of conditions and the following disclaimer in the
11
+ documentation and/or other materials provided with the distribution.
12
+ * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
13
+ Laboratory, the U.S. Department of Energy, nor the names of their
14
+ contributors may be used to endorse or promote products derived
15
+ from this software without specific prior written permission.
16
+
17
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
18
+ EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20
+ PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
21
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
24
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
25
+ OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
+
29
+ The U.S. Department of Energy funded the development of this software
30
+ under subcontract 7078610 with Lawrence Berkeley National Laboratory.
31
+
32
+
33
+ This code also includes files from the NVIDIA Tools Extension SDK project.
34
+
35
+ See:
36
+
37
+ https://github.com/NVIDIA/NVTX
38
+
39
+ for more information and license details.
.venv/lib/python3.12/site-packages/sklearn/__check_build/__init__.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module to give helpful messages to the user that did not
2
+ compile scikit-learn properly.
3
+ """
4
+
5
+ # Authors: The scikit-learn developers
6
+ # SPDX-License-Identifier: BSD-3-Clause
7
+
8
+ import os
9
+
10
+ INPLACE_MSG = """
11
+ It appears that you are importing a local scikit-learn source tree. For
12
+ this, you need to have an inplace install. Maybe you are in the source
13
+ directory and you need to try from another location."""
14
+
15
+ STANDARD_MSG = """
16
+ If you have used an installer, please check that it is suited for your
17
+ Python version, your operating system and your platform."""
18
+
19
+
20
+ def raise_build_error(e):
21
+ # Raise a comprehensible error and list the contents of the
22
+ # directory to help debugging on the mailing list.
23
+ local_dir = os.path.split(__file__)[0]
24
+ msg = STANDARD_MSG
25
+ if local_dir == "sklearn/__check_build":
26
+ # Picking up the local install: this will work only if the
27
+ # install is an 'inplace build'
28
+ msg = INPLACE_MSG
29
+ dir_content = list()
30
+ for i, filename in enumerate(os.listdir(local_dir)):
31
+ if (i + 1) % 3:
32
+ dir_content.append(filename.ljust(26))
33
+ else:
34
+ dir_content.append(filename + "\n")
35
+ raise ImportError(
36
+ """%s
37
+ ___________________________________________________________________________
38
+ Contents of %s:
39
+ %s
40
+ ___________________________________________________________________________
41
+ It seems that scikit-learn has not been built correctly.
42
+
43
+ If you have installed scikit-learn from source, please do not forget
44
+ to build the package before using it. For detailed instructions, see:
45
+ https://scikit-learn.org/dev/developers/advanced_installation.html#building-from-source
46
+ %s"""
47
+ % (e, local_dir, "".join(dir_content).strip(), msg)
48
+ )
49
+
50
+
51
+ try:
52
+ from ._check_build import check_build # noqa: F401
53
+ except ImportError as e:
54
+ raise_build_error(e)
.venv/lib/python3.12/site-packages/sklearn/__check_build/_check_build.cpython-312-x86_64-linux-gnu.so ADDED
Binary file (45.3 kB). View file
 
.venv/lib/python3.12/site-packages/sklearn/__check_build/_check_build.pyx ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ def check_build():
2
+ return
.venv/lib/python3.12/site-packages/sklearn/__check_build/meson.build ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ py.extension_module(
2
+ '_check_build',
3
+ cython_gen.process('_check_build.pyx'),
4
+ install: true,
5
+ subdir: 'sklearn/__check_build',
6
+ )
.venv/lib/python3.12/site-packages/sklearn/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (3.12 kB). View file
 
.venv/lib/python3.12/site-packages/sklearn/__pycache__/_built_with_meson.cpython-312.pyc ADDED
Binary file (193 Bytes). View file
 
.venv/lib/python3.12/site-packages/sklearn/__pycache__/_config.cpython-312.pyc ADDED
Binary file (14.2 kB). View file
 
.venv/lib/python3.12/site-packages/sklearn/__pycache__/_distributor_init.cpython-312.pyc ADDED
Binary file (550 Bytes). View file
 
.venv/lib/python3.12/site-packages/sklearn/__pycache__/base.cpython-312.pyc ADDED
Binary file (51.3 kB). View file
 
.venv/lib/python3.12/site-packages/sklearn/__pycache__/exceptions.cpython-312.pyc ADDED
Binary file (9.49 kB). View file
 
.venv/lib/python3.12/site-packages/sklearn/_build_utils/__init__.py ADDED
File without changes
.venv/lib/python3.12/site-packages/sklearn/_build_utils/tempita.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ # Authors: The scikit-learn developers
4
+ # SPDX-License-Identifier: BSD-3-Clause
5
+
6
+ import argparse
7
+ import os
8
+
9
+ from Cython import Tempita as tempita
10
+
11
+ # XXX: If this import ever fails (does it really?), vendor either
12
+ # cython.tempita or numpy/npy_tempita.
13
+
14
+
15
+ def process_tempita(fromfile, outfile=None):
16
+ """Process tempita templated file and write out the result.
17
+
18
+ The template file is expected to end in `.c.tp` or `.pyx.tp`:
19
+ E.g. processing `template.c.in` generates `template.c`.
20
+
21
+ """
22
+ with open(fromfile, "r", encoding="utf-8") as f:
23
+ template_content = f.read()
24
+
25
+ template = tempita.Template(template_content)
26
+ content = template.substitute()
27
+
28
+ with open(outfile, "w", encoding="utf-8") as f:
29
+ f.write(content)
30
+
31
+
32
+ def main():
33
+ parser = argparse.ArgumentParser()
34
+ parser.add_argument("infile", type=str, help="Path to the input file")
35
+ parser.add_argument("-o", "--outdir", type=str, help="Path to the output directory")
36
+ parser.add_argument(
37
+ "-i",
38
+ "--ignore",
39
+ type=str,
40
+ help=(
41
+ "An ignored input - may be useful to add a "
42
+ "dependency between custom targets"
43
+ ),
44
+ )
45
+ args = parser.parse_args()
46
+
47
+ if not args.infile.endswith(".tp"):
48
+ raise ValueError(f"Unexpected extension: {args.infile}")
49
+
50
+ if not args.outdir:
51
+ raise ValueError("Missing `--outdir` argument to tempita.py")
52
+
53
+ outdir_abs = os.path.join(os.getcwd(), args.outdir)
54
+ outfile = os.path.join(
55
+ outdir_abs, os.path.splitext(os.path.split(args.infile)[1])[0]
56
+ )
57
+
58
+ process_tempita(args.infile, outfile)
59
+
60
+
61
+ if __name__ == "__main__":
62
+ main()
.venv/lib/python3.12/site-packages/sklearn/_build_utils/version.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Extract version number from __init__.py"""
3
+
4
+ # Authors: The scikit-learn developers
5
+ # SPDX-License-Identifier: BSD-3-Clause
6
+
7
+ import os
8
+
9
+ sklearn_init = os.path.join(os.path.dirname(__file__), "../__init__.py")
10
+
11
+ data = open(sklearn_init).readlines()
12
+ version_line = next(line for line in data if line.startswith("__version__"))
13
+
14
+ version = version_line.strip().split(" = ")[1].replace('"', "").replace("'", "")
15
+
16
+ print(version)
.venv/lib/python3.12/site-packages/sklearn/_loss/__init__.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ The :mod:`sklearn._loss` module includes loss function classes suitable for
3
+ fitting classification and regression tasks.
4
+ """
5
+
6
+ # Authors: The scikit-learn developers
7
+ # SPDX-License-Identifier: BSD-3-Clause
8
+
9
+ from .loss import (
10
+ AbsoluteError,
11
+ HalfBinomialLoss,
12
+ HalfGammaLoss,
13
+ HalfMultinomialLoss,
14
+ HalfPoissonLoss,
15
+ HalfSquaredError,
16
+ HalfTweedieLoss,
17
+ HalfTweedieLossIdentity,
18
+ HuberLoss,
19
+ PinballLoss,
20
+ )
21
+
22
+ __all__ = [
23
+ "AbsoluteError",
24
+ "HalfBinomialLoss",
25
+ "HalfGammaLoss",
26
+ "HalfMultinomialLoss",
27
+ "HalfPoissonLoss",
28
+ "HalfSquaredError",
29
+ "HalfTweedieLoss",
30
+ "HalfTweedieLossIdentity",
31
+ "HuberLoss",
32
+ "PinballLoss",
33
+ ]
.venv/lib/python3.12/site-packages/sklearn/_loss/_loss.pxd ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Fused types for input like y_true, raw_prediction, sample_weights.
2
+ ctypedef fused floating_in:
3
+ double
4
+ float
5
+
6
+
7
+ # Fused types for output like gradient and hessian
8
+ # We use a different fused types for input (floating_in) and output (floating_out), such
9
+ # that input and output can have different dtypes in the same function call. A single
10
+ # fused type can only take on one single value (type) for all arguments in one function
11
+ # call.
12
+ ctypedef fused floating_out:
13
+ double
14
+ float
15
+
16
+
17
+ # Struct to return 2 doubles
18
+ ctypedef struct double_pair:
19
+ double val1
20
+ double val2
21
+
22
+
23
+ # C base class for loss functions
24
+ cdef class CyLossFunction:
25
+ cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
26
+ cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
27
+ cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
28
+
29
+
30
+ cdef class CyHalfSquaredError(CyLossFunction):
31
+ cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
32
+ cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
33
+ cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
34
+
35
+
36
+ cdef class CyAbsoluteError(CyLossFunction):
37
+ cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
38
+ cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
39
+ cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
40
+
41
+
42
+ cdef class CyPinballLoss(CyLossFunction):
43
+ cdef readonly double quantile # readonly makes it accessible from Python
44
+ cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
45
+ cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
46
+ cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
47
+
48
+
49
+ cdef class CyHuberLoss(CyLossFunction):
50
+ cdef public double delta # public makes it accessible from Python
51
+ cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
52
+ cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
53
+ cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
54
+
55
+
56
+ cdef class CyHalfPoissonLoss(CyLossFunction):
57
+ cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
58
+ cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
59
+ cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
60
+
61
+
62
+ cdef class CyHalfGammaLoss(CyLossFunction):
63
+ cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
64
+ cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
65
+ cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
66
+
67
+
68
+ cdef class CyHalfTweedieLoss(CyLossFunction):
69
+ cdef readonly double power # readonly makes it accessible from Python
70
+ cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
71
+ cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
72
+ cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
73
+
74
+
75
+ cdef class CyHalfTweedieLossIdentity(CyLossFunction):
76
+ cdef readonly double power # readonly makes it accessible from Python
77
+ cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
78
+ cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
79
+ cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
80
+
81
+
82
+ cdef class CyHalfBinomialLoss(CyLossFunction):
83
+ cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
84
+ cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
85
+ cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
86
+
87
+
88
+ cdef class CyExponentialLoss(CyLossFunction):
89
+ cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
90
+ cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
91
+ cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
92
+
93
+
94
+ cdef class CyHalfMultinomialLoss():
95
+ cdef void cy_gradient(
96
+ self,
97
+ const floating_in y_true,
98
+ const floating_in[::1] raw_prediction,
99
+ const floating_in sample_weight,
100
+ floating_out[::1] gradient_out,
101
+ ) noexcept nogil
.venv/lib/python3.12/site-packages/sklearn/_loss/_loss.pyx.tp ADDED
@@ -0,0 +1,1505 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{py:
2
+
3
+ """
4
+ Template file to easily generate loops over samples using Tempita
5
+ (https://github.com/cython/cython/blob/master/Cython/Tempita/_tempita.py).
6
+
7
+ Generated file: _loss.pyx
8
+
9
+ Each loss class is generated by a cdef functions on single samples.
10
+ The keywords between double braces are substituted during the build.
11
+ """
12
+
13
+ doc_HalfSquaredError = (
14
+ """Half Squared Error with identity link.
15
+
16
+ Domain:
17
+ y_true and y_pred all real numbers
18
+
19
+ Link:
20
+ y_pred = raw_prediction
21
+ """
22
+ )
23
+
24
+ doc_AbsoluteError = (
25
+ """Absolute Error with identity link.
26
+
27
+ Domain:
28
+ y_true and y_pred all real numbers
29
+
30
+ Link:
31
+ y_pred = raw_prediction
32
+ """
33
+ )
34
+
35
+ doc_PinballLoss = (
36
+ """Quantile Loss aka Pinball Loss with identity link.
37
+
38
+ Domain:
39
+ y_true and y_pred all real numbers
40
+ quantile in (0, 1)
41
+
42
+ Link:
43
+ y_pred = raw_prediction
44
+
45
+ Note: 2 * cPinballLoss(quantile=0.5) equals cAbsoluteError()
46
+ """
47
+ )
48
+
49
+ doc_HuberLoss = (
50
+ """Huber Loss with identity link.
51
+
52
+ Domain:
53
+ y_true and y_pred all real numbers
54
+ delta in positive real numbers
55
+
56
+ Link:
57
+ y_pred = raw_prediction
58
+ """
59
+ )
60
+
61
+ doc_HalfPoissonLoss = (
62
+ """Half Poisson deviance loss with log-link.
63
+
64
+ Domain:
65
+ y_true in non-negative real numbers
66
+ y_pred in positive real numbers
67
+
68
+ Link:
69
+ y_pred = exp(raw_prediction)
70
+
71
+ Half Poisson deviance with log-link is
72
+ y_true * log(y_true/y_pred) + y_pred - y_true
73
+ = y_true * log(y_true) - y_true * raw_prediction
74
+ + exp(raw_prediction) - y_true
75
+
76
+ Dropping constant terms, this gives:
77
+ exp(raw_prediction) - y_true * raw_prediction
78
+ """
79
+ )
80
+
81
+ doc_HalfGammaLoss = (
82
+ """Half Gamma deviance loss with log-link.
83
+
84
+ Domain:
85
+ y_true and y_pred in positive real numbers
86
+
87
+ Link:
88
+ y_pred = exp(raw_prediction)
89
+
90
+ Half Gamma deviance with log-link is
91
+ log(y_pred/y_true) + y_true/y_pred - 1
92
+ = raw_prediction - log(y_true) + y_true * exp(-raw_prediction) - 1
93
+
94
+ Dropping constant terms, this gives:
95
+ raw_prediction + y_true * exp(-raw_prediction)
96
+ """
97
+ )
98
+
99
+ doc_HalfTweedieLoss = (
100
+ """Half Tweedie deviance loss with log-link.
101
+
102
+ Domain:
103
+ y_true in real numbers if p <= 0
104
+ y_true in non-negative real numbers if 0 < p < 2
105
+ y_true in positive real numbers if p >= 2
106
+ y_pred and power in positive real numbers
107
+
108
+ Link:
109
+ y_pred = exp(raw_prediction)
110
+
111
+ Half Tweedie deviance with log-link and p=power is
112
+ max(y_true, 0)**(2-p) / (1-p) / (2-p)
113
+ - y_true * y_pred**(1-p) / (1-p)
114
+ + y_pred**(2-p) / (2-p)
115
+ = max(y_true, 0)**(2-p) / (1-p) / (2-p)
116
+ - y_true * exp((1-p) * raw_prediction) / (1-p)
117
+ + exp((2-p) * raw_prediction) / (2-p)
118
+
119
+ Dropping constant terms, this gives:
120
+ exp((2-p) * raw_prediction) / (2-p)
121
+ - y_true * exp((1-p) * raw_prediction) / (1-p)
122
+
123
+ Notes:
124
+ - Poisson with p=1 and Gamma with p=2 have different terms dropped such
125
+ that cHalfTweedieLoss is not continuous in p=power at p=1 and p=2.
126
+ - While the Tweedie distribution only exists for p<=0 or p>=1, the range
127
+ 0<p<1 still gives a strictly consistent scoring function for the
128
+ expectation.
129
+ """
130
+ )
131
+
132
+ doc_HalfTweedieLossIdentity = (
133
+ """Half Tweedie deviance loss with identity link.
134
+
135
+ Domain:
136
+ y_true in real numbers if p <= 0
137
+ y_true in non-negative real numbers if 0 < p < 2
138
+ y_true in positive real numbers if p >= 2
139
+ y_pred and power in positive real numbers, y_pred may be negative for p=0.
140
+
141
+ Link:
142
+ y_pred = raw_prediction
143
+
144
+ Half Tweedie deviance with identity link and p=power is
145
+ max(y_true, 0)**(2-p) / (1-p) / (2-p)
146
+ - y_true * y_pred**(1-p) / (1-p)
147
+ + y_pred**(2-p) / (2-p)
148
+
149
+ Notes:
150
+ - Here, we do not drop constant terms in contrast to the version with log-link.
151
+ """
152
+ )
153
+
154
+ doc_HalfBinomialLoss = (
155
+ """Half Binomial deviance loss with logit link.
156
+
157
+ Domain:
158
+ y_true in [0, 1]
159
+ y_pred in (0, 1), i.e. boundaries excluded
160
+
161
+ Link:
162
+ y_pred = expit(raw_prediction)
163
+ """
164
+ )
165
+
166
+ doc_ExponentialLoss = (
167
+ """"Exponential loss with (half) logit link
168
+
169
+ Domain:
170
+ y_true in [0, 1]
171
+ y_pred in (0, 1), i.e. boundaries excluded
172
+
173
+ Link:
174
+ y_pred = expit(2 * raw_prediction)
175
+ """
176
+ )
177
+
178
+ # loss class name, docstring, param,
179
+ # cy_loss, cy_loss_grad,
180
+ # cy_grad, cy_grad_hess,
181
+ class_list = [
182
+ ("CyHalfSquaredError", doc_HalfSquaredError, None,
183
+ "closs_half_squared_error", None,
184
+ "cgradient_half_squared_error", "cgrad_hess_half_squared_error"),
185
+ ("CyAbsoluteError", doc_AbsoluteError, None,
186
+ "closs_absolute_error", None,
187
+ "cgradient_absolute_error", "cgrad_hess_absolute_error"),
188
+ ("CyPinballLoss", doc_PinballLoss, "quantile",
189
+ "closs_pinball_loss", None,
190
+ "cgradient_pinball_loss", "cgrad_hess_pinball_loss"),
191
+ ("CyHuberLoss", doc_HuberLoss, "delta",
192
+ "closs_huber_loss", None,
193
+ "cgradient_huber_loss", "cgrad_hess_huber_loss"),
194
+ ("CyHalfPoissonLoss", doc_HalfPoissonLoss, None,
195
+ "closs_half_poisson", "closs_grad_half_poisson",
196
+ "cgradient_half_poisson", "cgrad_hess_half_poisson"),
197
+ ("CyHalfGammaLoss", doc_HalfGammaLoss, None,
198
+ "closs_half_gamma", "closs_grad_half_gamma",
199
+ "cgradient_half_gamma", "cgrad_hess_half_gamma"),
200
+ ("CyHalfTweedieLoss", doc_HalfTweedieLoss, "power",
201
+ "closs_half_tweedie", "closs_grad_half_tweedie",
202
+ "cgradient_half_tweedie", "cgrad_hess_half_tweedie"),
203
+ ("CyHalfTweedieLossIdentity", doc_HalfTweedieLossIdentity, "power",
204
+ "closs_half_tweedie_identity", "closs_grad_half_tweedie_identity",
205
+ "cgradient_half_tweedie_identity", "cgrad_hess_half_tweedie_identity"),
206
+ ("CyHalfBinomialLoss", doc_HalfBinomialLoss, None,
207
+ "closs_half_binomial", "closs_grad_half_binomial",
208
+ "cgradient_half_binomial", "cgrad_hess_half_binomial"),
209
+ ("CyExponentialLoss", doc_ExponentialLoss, None,
210
+ "closs_exponential", "closs_grad_exponential",
211
+ "cgradient_exponential", "cgrad_hess_exponential"),
212
+ ]
213
+ }}
214
+
215
+ # Design:
216
+ # See https://github.com/scikit-learn/scikit-learn/issues/15123 for reasons.
217
+ # a) Merge link functions into loss functions for speed and numerical
218
+ # stability, i.e. use raw_prediction instead of y_pred in signature.
219
+ # b) Pure C functions (nogil) calculate single points (single sample)
220
+ # c) Wrap C functions in a loop to get Python functions operating on ndarrays.
221
+ # - Write loops manually---use Tempita for this.
222
+ # Reason: There is still some performance overhead when using a wrapper
223
+ # function "wrap" that carries out the loop and gets as argument a function
224
+ # pointer to one of the C functions from b), e.g.
225
+ # wrap(closs_half_poisson, y_true, ...)
226
+ # - Pass n_threads as argument to prange and propagate option to all callers.
227
+ # d) Provide classes (Cython extension types) per loss (names start with Cy) in
228
+ # order to have semantical structured objects.
229
+ # - Member functions for single points just call the C function from b).
230
+ # These are used e.g. in SGD `_plain_sgd`.
231
+ # - Member functions operating on ndarrays, see c), looping over calls to C
232
+ # functions from b).
233
+ # e) Provide convenience Python classes that compose from these extension types
234
+ # elsewhere (see loss.py)
235
+ # - Example: loss.gradient calls CyLoss.gradient but does some input
236
+ # checking like None -> np.empty().
237
+ #
238
+ # Note: We require 1-dim ndarrays to be contiguous.
239
+
240
+ from cython.parallel import parallel, prange
241
+ import numpy as np
242
+
243
+ from libc.math cimport exp, fabs, log, log1p, pow
244
+ from libc.stdlib cimport malloc, free
245
+
246
+
247
+ # -------------------------------------
248
+ # Helper functions
249
+ # -------------------------------------
250
+ # Numerically stable version of log(1 + exp(x)) for double precision, see Eq. (10) of
251
+ # https://cran.r-project.org/web/packages/Rmpfr/vignettes/log1mexp-note.pdf
252
+ # Note: The only important cutoff is at x = 18. All others are to save computation
253
+ # time. Compared to the reference, we add the additional case distinction x <= -2 in
254
+ # order to use log instead of log1p for improved performance. As with the other
255
+ # cutoffs, this is accurate within machine precision of double.
256
+ cdef inline double log1pexp(double x) noexcept nogil:
257
+ if x <= -37:
258
+ return exp(x)
259
+ elif x <= -2:
260
+ return log1p(exp(x))
261
+ elif x <= 18:
262
+ return log(1. + exp(x))
263
+ elif x <= 33.3:
264
+ return x + exp(-x)
265
+ else:
266
+ return x
267
+
268
+
269
+ cdef inline double_pair sum_exp_minus_max(
270
+ const int i,
271
+ const floating_in[:, :] raw_prediction, # IN
272
+ floating_out *p # OUT
273
+ ) noexcept nogil:
274
+ # Thread local buffers are used to store part of the results via p.
275
+ # The results are stored as follows:
276
+ # p[k] = exp(raw_prediction_i_k - max_value) for k = 0 to n_classes-1
277
+ # return.val1 = max_value = max(raw_prediction_i_k, k = 0 to n_classes-1)
278
+ # return.val2 = sum_exps = sum(p[k], k = 0 to n_classes-1) = sum of exponentials
279
+ # len(p) must be n_classes
280
+ # Notes:
281
+ # - We return the max value and sum of exps (stored in p) as a double_pair.
282
+ # - i needs to be passed (and stays constant) because otherwise Cython does
283
+ # not generate optimal code, see
284
+ # https://github.com/scikit-learn/scikit-learn/issues/17299
285
+ # - We do not normalize p by calculating p[k] = p[k] / sum_exps.
286
+ # This helps to save one loop over k.
287
+ cdef:
288
+ int k
289
+ int n_classes = raw_prediction.shape[1]
290
+ double_pair max_value_and_sum_exps # val1 = max_value, val2 = sum_exps
291
+
292
+ max_value_and_sum_exps.val1 = raw_prediction[i, 0]
293
+ max_value_and_sum_exps.val2 = 0
294
+ for k in range(1, n_classes):
295
+ # Compute max value of array for numerical stability
296
+ if max_value_and_sum_exps.val1 < raw_prediction[i, k]:
297
+ max_value_and_sum_exps.val1 = raw_prediction[i, k]
298
+
299
+ for k in range(n_classes):
300
+ p[k] = exp(raw_prediction[i, k] - max_value_and_sum_exps.val1)
301
+ max_value_and_sum_exps.val2 += p[k]
302
+
303
+ return max_value_and_sum_exps
304
+
305
+
306
+ # -------------------------------------
307
+ # Single point inline C functions
308
+ # -------------------------------------
309
+ # Half Squared Error
310
+ cdef inline double closs_half_squared_error(
311
+ double y_true,
312
+ double raw_prediction
313
+ ) noexcept nogil:
314
+ return 0.5 * (raw_prediction - y_true) * (raw_prediction - y_true)
315
+
316
+
317
+ cdef inline double cgradient_half_squared_error(
318
+ double y_true,
319
+ double raw_prediction
320
+ ) noexcept nogil:
321
+ return raw_prediction - y_true
322
+
323
+
324
+ cdef inline double_pair cgrad_hess_half_squared_error(
325
+ double y_true,
326
+ double raw_prediction
327
+ ) noexcept nogil:
328
+ cdef double_pair gh
329
+ gh.val1 = raw_prediction - y_true # gradient
330
+ gh.val2 = 1. # hessian
331
+ return gh
332
+
333
+
334
+ # Absolute Error
335
+ cdef inline double closs_absolute_error(
336
+ double y_true,
337
+ double raw_prediction
338
+ ) noexcept nogil:
339
+ return fabs(raw_prediction - y_true)
340
+
341
+
342
+ cdef inline double cgradient_absolute_error(
343
+ double y_true,
344
+ double raw_prediction
345
+ ) noexcept nogil:
346
+ return 1. if raw_prediction > y_true else -1.
347
+
348
+
349
+ cdef inline double_pair cgrad_hess_absolute_error(
350
+ double y_true,
351
+ double raw_prediction
352
+ ) noexcept nogil:
353
+ cdef double_pair gh
354
+ # Note that exact hessian = 0 almost everywhere. Optimization routines like
355
+ # in HGBT, however, need a hessian > 0. Therefore, we assign 1.
356
+ gh.val1 = 1. if raw_prediction > y_true else -1. # gradient
357
+ gh.val2 = 1. # hessian
358
+ return gh
359
+
360
+
361
+ # Quantile Loss / Pinball Loss
362
+ cdef inline double closs_pinball_loss(
363
+ double y_true,
364
+ double raw_prediction,
365
+ double quantile
366
+ ) noexcept nogil:
367
+ return (quantile * (y_true - raw_prediction) if y_true >= raw_prediction
368
+ else (1. - quantile) * (raw_prediction - y_true))
369
+
370
+
371
+ cdef inline double cgradient_pinball_loss(
372
+ double y_true,
373
+ double raw_prediction,
374
+ double quantile
375
+ ) noexcept nogil:
376
+ return -quantile if y_true >=raw_prediction else 1. - quantile
377
+
378
+
379
+ cdef inline double_pair cgrad_hess_pinball_loss(
380
+ double y_true,
381
+ double raw_prediction,
382
+ double quantile
383
+ ) noexcept nogil:
384
+ cdef double_pair gh
385
+ # Note that exact hessian = 0 almost everywhere. Optimization routines like
386
+ # in HGBT, however, need a hessian > 0. Therefore, we assign 1.
387
+ gh.val1 = -quantile if y_true >=raw_prediction else 1. - quantile # gradient
388
+ gh.val2 = 1. # hessian
389
+ return gh
390
+
391
+
392
+ # Huber Loss
393
+ cdef inline double closs_huber_loss(
394
+ double y_true,
395
+ double raw_prediction,
396
+ double delta,
397
+ ) noexcept nogil:
398
+ cdef double abserr = fabs(y_true - raw_prediction)
399
+ if abserr <= delta:
400
+ return 0.5 * abserr**2
401
+ else:
402
+ return delta * (abserr - 0.5 * delta)
403
+
404
+
405
+ cdef inline double cgradient_huber_loss(
406
+ double y_true,
407
+ double raw_prediction,
408
+ double delta,
409
+ ) noexcept nogil:
410
+ cdef double res = raw_prediction - y_true
411
+ if fabs(res) <= delta:
412
+ return res
413
+ else:
414
+ return delta if res >=0 else -delta
415
+
416
+
417
+ cdef inline double_pair cgrad_hess_huber_loss(
418
+ double y_true,
419
+ double raw_prediction,
420
+ double delta,
421
+ ) noexcept nogil:
422
+ cdef double_pair gh
423
+ gh.val2 = raw_prediction - y_true # used as temporary
424
+ if fabs(gh.val2) <= delta:
425
+ gh.val1 = gh.val2 # gradient
426
+ gh.val2 = 1 # hessian
427
+ else:
428
+ gh.val1 = delta if gh.val2 >=0 else -delta # gradient
429
+ gh.val2 = 0 # hessian
430
+ return gh
431
+
432
+
433
+ # Half Poisson Deviance with Log-Link, dropping constant terms
434
+ cdef inline double closs_half_poisson(
435
+ double y_true,
436
+ double raw_prediction
437
+ ) noexcept nogil:
438
+ return exp(raw_prediction) - y_true * raw_prediction
439
+
440
+
441
+ cdef inline double cgradient_half_poisson(
442
+ double y_true,
443
+ double raw_prediction
444
+ ) noexcept nogil:
445
+ # y_pred - y_true
446
+ return exp(raw_prediction) - y_true
447
+
448
+
449
+ cdef inline double_pair closs_grad_half_poisson(
450
+ double y_true,
451
+ double raw_prediction
452
+ ) noexcept nogil:
453
+ cdef double_pair lg
454
+ lg.val2 = exp(raw_prediction) # used as temporary
455
+ lg.val1 = lg.val2 - y_true * raw_prediction # loss
456
+ lg.val2 -= y_true # gradient
457
+ return lg
458
+
459
+
460
+ cdef inline double_pair cgrad_hess_half_poisson(
461
+ double y_true,
462
+ double raw_prediction
463
+ ) noexcept nogil:
464
+ cdef double_pair gh
465
+ gh.val2 = exp(raw_prediction) # hessian
466
+ gh.val1 = gh.val2 - y_true # gradient
467
+ return gh
468
+
469
+
470
+ # Half Gamma Deviance with Log-Link, dropping constant terms
471
+ cdef inline double closs_half_gamma(
472
+ double y_true,
473
+ double raw_prediction
474
+ ) noexcept nogil:
475
+ return raw_prediction + y_true * exp(-raw_prediction)
476
+
477
+
478
+ cdef inline double cgradient_half_gamma(
479
+ double y_true,
480
+ double raw_prediction
481
+ ) noexcept nogil:
482
+ return 1. - y_true * exp(-raw_prediction)
483
+
484
+
485
+ cdef inline double_pair closs_grad_half_gamma(
486
+ double y_true,
487
+ double raw_prediction
488
+ ) noexcept nogil:
489
+ cdef double_pair lg
490
+ lg.val2 = exp(-raw_prediction) # used as temporary
491
+ lg.val1 = raw_prediction + y_true * lg.val2 # loss
492
+ lg.val2 = 1. - y_true * lg.val2 # gradient
493
+ return lg
494
+
495
+
496
+ cdef inline double_pair cgrad_hess_half_gamma(
497
+ double y_true,
498
+ double raw_prediction
499
+ ) noexcept nogil:
500
+ cdef double_pair gh
501
+ gh.val2 = exp(-raw_prediction) # used as temporary
502
+ gh.val1 = 1. - y_true * gh.val2 # gradient
503
+ gh.val2 *= y_true # hessian
504
+ return gh
505
+
506
+
507
+ # Half Tweedie Deviance with Log-Link, dropping constant terms
508
+ # Note that by dropping constants this is no longer continuous in parameter power.
509
+ cdef inline double closs_half_tweedie(
510
+ double y_true,
511
+ double raw_prediction,
512
+ double power
513
+ ) noexcept nogil:
514
+ if power == 0.:
515
+ return closs_half_squared_error(y_true, exp(raw_prediction))
516
+ elif power == 1.:
517
+ return closs_half_poisson(y_true, raw_prediction)
518
+ elif power == 2.:
519
+ return closs_half_gamma(y_true, raw_prediction)
520
+ else:
521
+ return (exp((2. - power) * raw_prediction) / (2. - power)
522
+ - y_true * exp((1. - power) * raw_prediction) / (1. - power))
523
+
524
+
525
+ cdef inline double cgradient_half_tweedie(
526
+ double y_true,
527
+ double raw_prediction,
528
+ double power
529
+ ) noexcept nogil:
530
+ cdef double exp1
531
+ if power == 0.:
532
+ exp1 = exp(raw_prediction)
533
+ return exp1 * (exp1 - y_true)
534
+ elif power == 1.:
535
+ return cgradient_half_poisson(y_true, raw_prediction)
536
+ elif power == 2.:
537
+ return cgradient_half_gamma(y_true, raw_prediction)
538
+ else:
539
+ return (exp((2. - power) * raw_prediction)
540
+ - y_true * exp((1. - power) * raw_prediction))
541
+
542
+
543
+ cdef inline double_pair closs_grad_half_tweedie(
544
+ double y_true,
545
+ double raw_prediction,
546
+ double power
547
+ ) noexcept nogil:
548
+ cdef double_pair lg
549
+ cdef double exp1, exp2
550
+ if power == 0.:
551
+ exp1 = exp(raw_prediction)
552
+ lg.val1 = closs_half_squared_error(y_true, exp1) # loss
553
+ lg.val2 = exp1 * (exp1 - y_true) # gradient
554
+ elif power == 1.:
555
+ return closs_grad_half_poisson(y_true, raw_prediction)
556
+ elif power == 2.:
557
+ return closs_grad_half_gamma(y_true, raw_prediction)
558
+ else:
559
+ exp1 = exp((1. - power) * raw_prediction)
560
+ exp2 = exp((2. - power) * raw_prediction)
561
+ lg.val1 = exp2 / (2. - power) - y_true * exp1 / (1. - power) # loss
562
+ lg.val2 = exp2 - y_true * exp1 # gradient
563
+ return lg
564
+
565
+
566
+ cdef inline double_pair cgrad_hess_half_tweedie(
567
+ double y_true,
568
+ double raw_prediction,
569
+ double power
570
+ ) noexcept nogil:
571
+ cdef double_pair gh
572
+ cdef double exp1, exp2
573
+ if power == 0.:
574
+ exp1 = exp(raw_prediction)
575
+ gh.val1 = exp1 * (exp1 - y_true) # gradient
576
+ gh.val2 = exp1 * (2 * exp1 - y_true) # hessian
577
+ elif power == 1.:
578
+ return cgrad_hess_half_poisson(y_true, raw_prediction)
579
+ elif power == 2.:
580
+ return cgrad_hess_half_gamma(y_true, raw_prediction)
581
+ else:
582
+ exp1 = exp((1. - power) * raw_prediction)
583
+ exp2 = exp((2. - power) * raw_prediction)
584
+ gh.val1 = exp2 - y_true * exp1 # gradient
585
+ gh.val2 = (2. - power) * exp2 - (1. - power) * y_true * exp1 # hessian
586
+ return gh
587
+
588
+
589
+ # Half Tweedie Deviance with identity link, without dropping constant terms!
590
+ # Therefore, best loss value is zero.
591
+ cdef inline double closs_half_tweedie_identity(
592
+ double y_true,
593
+ double raw_prediction,
594
+ double power
595
+ ) noexcept nogil:
596
+ cdef double tmp
597
+ if power == 0.:
598
+ return closs_half_squared_error(y_true, raw_prediction)
599
+ elif power == 1.:
600
+ if y_true == 0:
601
+ return raw_prediction
602
+ else:
603
+ return y_true * log(y_true/raw_prediction) + raw_prediction - y_true
604
+ elif power == 2.:
605
+ return log(raw_prediction/y_true) + y_true/raw_prediction - 1.
606
+ else:
607
+ tmp = pow(raw_prediction, 1. - power)
608
+ tmp = raw_prediction * tmp / (2. - power) - y_true * tmp / (1. - power)
609
+ if y_true > 0:
610
+ tmp += pow(y_true, 2. - power) / ((1. - power) * (2. - power))
611
+ return tmp
612
+
613
+
614
+ cdef inline double cgradient_half_tweedie_identity(
615
+ double y_true,
616
+ double raw_prediction,
617
+ double power
618
+ ) noexcept nogil:
619
+ if power == 0.:
620
+ return raw_prediction - y_true
621
+ elif power == 1.:
622
+ return 1. - y_true / raw_prediction
623
+ elif power == 2.:
624
+ return (raw_prediction - y_true) / (raw_prediction * raw_prediction)
625
+ else:
626
+ return pow(raw_prediction, -power) * (raw_prediction - y_true)
627
+
628
+
629
+ cdef inline double_pair closs_grad_half_tweedie_identity(
630
+ double y_true,
631
+ double raw_prediction,
632
+ double power
633
+ ) noexcept nogil:
634
+ cdef double_pair lg
635
+ cdef double tmp
636
+ if power == 0.:
637
+ lg.val2 = raw_prediction - y_true # gradient
638
+ lg.val1 = 0.5 * lg.val2 * lg.val2 # loss
639
+ elif power == 1.:
640
+ if y_true == 0:
641
+ lg.val1 = raw_prediction
642
+ else:
643
+ lg.val1 = (y_true * log(y_true/raw_prediction) # loss
644
+ + raw_prediction - y_true)
645
+ lg.val2 = 1. - y_true / raw_prediction # gradient
646
+ elif power == 2.:
647
+ lg.val1 = log(raw_prediction/y_true) + y_true/raw_prediction - 1. # loss
648
+ tmp = raw_prediction * raw_prediction
649
+ lg.val2 = (raw_prediction - y_true) / tmp # gradient
650
+ else:
651
+ tmp = pow(raw_prediction, 1. - power)
652
+ lg.val1 = (raw_prediction * tmp / (2. - power) # loss
653
+ - y_true * tmp / (1. - power))
654
+ if y_true > 0:
655
+ lg.val1 += (pow(y_true, 2. - power)
656
+ / ((1. - power) * (2. - power)))
657
+ lg.val2 = tmp * (1. - y_true / raw_prediction) # gradient
658
+ return lg
659
+
660
+
661
+ cdef inline double_pair cgrad_hess_half_tweedie_identity(
662
+ double y_true,
663
+ double raw_prediction,
664
+ double power
665
+ ) noexcept nogil:
666
+ cdef double_pair gh
667
+ cdef double tmp
668
+ if power == 0.:
669
+ gh.val1 = raw_prediction - y_true # gradient
670
+ gh.val2 = 1. # hessian
671
+ elif power == 1.:
672
+ gh.val1 = 1. - y_true / raw_prediction # gradient
673
+ gh.val2 = y_true / (raw_prediction * raw_prediction) # hessian
674
+ elif power == 2.:
675
+ tmp = raw_prediction * raw_prediction
676
+ gh.val1 = (raw_prediction - y_true) / tmp # gradient
677
+ gh.val2 = (-1. + 2. * y_true / raw_prediction) / tmp # hessian
678
+ else:
679
+ tmp = pow(raw_prediction, -power)
680
+ gh.val1 = tmp * (raw_prediction - y_true) # gradient
681
+ gh.val2 = tmp * ((1. - power) + power * y_true / raw_prediction) # hessian
682
+ return gh
683
+
684
+
685
+ # Half Binomial deviance with logit-link, aka log-loss or binary cross entropy
686
+ cdef inline double closs_half_binomial(
687
+ double y_true,
688
+ double raw_prediction
689
+ ) noexcept nogil:
690
+ # log1p(exp(raw_prediction)) - y_true * raw_prediction
691
+ return log1pexp(raw_prediction) - y_true * raw_prediction
692
+
693
+
694
+ cdef inline double cgradient_half_binomial(
695
+ double y_true,
696
+ double raw_prediction
697
+ ) noexcept nogil:
698
+ # gradient = y_pred - y_true = expit(raw_prediction) - y_true
699
+ # Numerically more stable, see http://fa.bianp.net/blog/2019/evaluate_logistic/
700
+ # if raw_prediction < 0:
701
+ # exp_tmp = exp(raw_prediction)
702
+ # return ((1 - y_true) * exp_tmp - y_true) / (1 + exp_tmp)
703
+ # else:
704
+ # exp_tmp = exp(-raw_prediction)
705
+ # return ((1 - y_true) - y_true * exp_tmp) / (1 + exp_tmp)
706
+ # Note that optimal speed would be achieved, at the cost of precision, by
707
+ # return expit(raw_prediction) - y_true
708
+ # i.e. no "if else" and an own inline implementation of expit instead of
709
+ # from scipy.special.cython_special cimport expit
710
+ # The case distinction raw_prediction < 0 in the stable implementation does not
711
+ # provide significant better precision apart from protecting overflow of exp(..).
712
+ # The branch (if else), however, can incur runtime costs of up to 30%.
713
+ # Instead, we help branch prediction by almost always ending in the first if clause
714
+ # and making the second branch (else) a bit simpler. This has the exact same
715
+ # precision but is faster than the stable implementation.
716
+ # As branching criteria, we use the same cutoff as in log1pexp. Note that the
717
+ # maximal value to get gradient = -1 with y_true = 1 is -37.439198610162731
718
+ # (based on mpmath), and scipy.special.logit(np.finfo(float).eps) ~ -36.04365.
719
+ cdef double exp_tmp
720
+ if raw_prediction > -37:
721
+ exp_tmp = exp(-raw_prediction)
722
+ return ((1 - y_true) - y_true * exp_tmp) / (1 + exp_tmp)
723
+ else:
724
+ # expit(raw_prediction) = exp(raw_prediction) for raw_prediction <= -37
725
+ return exp(raw_prediction) - y_true
726
+
727
+
728
+ cdef inline double_pair closs_grad_half_binomial(
729
+ double y_true,
730
+ double raw_prediction
731
+ ) noexcept nogil:
732
+ cdef double_pair lg
733
+ # Same if else conditions as in log1pexp.
734
+ if raw_prediction <= -37:
735
+ lg.val2 = exp(raw_prediction) # used as temporary
736
+ lg.val1 = lg.val2 - y_true * raw_prediction # loss
737
+ lg.val2 -= y_true # gradient
738
+ elif raw_prediction <= -2:
739
+ lg.val2 = exp(raw_prediction) # used as temporary
740
+ lg.val1 = log1p(lg.val2) - y_true * raw_prediction # loss
741
+ lg.val2 = ((1 - y_true) * lg.val2 - y_true) / (1 + lg.val2) # gradient
742
+ elif raw_prediction <= 18:
743
+ lg.val2 = exp(-raw_prediction) # used as temporary
744
+ # log1p(exp(x)) = log(1 + exp(x)) = x + log1p(exp(-x))
745
+ lg.val1 = log1p(lg.val2) + (1 - y_true) * raw_prediction # loss
746
+ lg.val2 = ((1 - y_true) - y_true * lg.val2) / (1 + lg.val2) # gradient
747
+ else:
748
+ lg.val2 = exp(-raw_prediction) # used as temporary
749
+ lg.val1 = lg.val2 + (1 - y_true) * raw_prediction # loss
750
+ lg.val2 = ((1 - y_true) - y_true * lg.val2) / (1 + lg.val2) # gradient
751
+ return lg
752
+
753
+
754
+ cdef inline double_pair cgrad_hess_half_binomial(
755
+ double y_true,
756
+ double raw_prediction
757
+ ) noexcept nogil:
758
+ # with y_pred = expit(raw)
759
+ # hessian = y_pred * (1 - y_pred) = exp( raw) / (1 + exp( raw))**2
760
+ # = exp(-raw) / (1 + exp(-raw))**2
761
+ cdef double_pair gh
762
+ # See comment in cgradient_half_binomial.
763
+ if raw_prediction > -37:
764
+ gh.val2 = exp(-raw_prediction) # used as temporary
765
+ gh.val1 = ((1 - y_true) - y_true * gh.val2) / (1 + gh.val2) # gradient
766
+ gh.val2 = gh.val2 / (1 + gh.val2)**2 # hessian
767
+ else:
768
+ gh.val2 = exp(raw_prediction) # = 1. order Taylor in exp(raw_prediction)
769
+ gh.val1 = gh.val2 - y_true
770
+ return gh
771
+
772
+
773
+ # Exponential loss with (half) logit-link, aka boosting loss
774
+ cdef inline double closs_exponential(
775
+ double y_true,
776
+ double raw_prediction
777
+ ) noexcept nogil:
778
+ cdef double tmp = exp(raw_prediction)
779
+ return y_true / tmp + (1 - y_true) * tmp
780
+
781
+
782
+ cdef inline double cgradient_exponential(
783
+ double y_true,
784
+ double raw_prediction
785
+ ) noexcept nogil:
786
+ cdef double tmp = exp(raw_prediction)
787
+ return -y_true / tmp + (1 - y_true) * tmp
788
+
789
+
790
+ cdef inline double_pair closs_grad_exponential(
791
+ double y_true,
792
+ double raw_prediction
793
+ ) noexcept nogil:
794
+ cdef double_pair lg
795
+ lg.val2 = exp(raw_prediction) # used as temporary
796
+
797
+ lg.val1 = y_true / lg.val2 + (1 - y_true) * lg.val2 # loss
798
+ lg.val2 = -y_true / lg.val2 + (1 - y_true) * lg.val2 # gradient
799
+ return lg
800
+
801
+
802
+ cdef inline double_pair cgrad_hess_exponential(
803
+ double y_true,
804
+ double raw_prediction
805
+ ) noexcept nogil:
806
+ # Note that hessian = loss
807
+ cdef double_pair gh
808
+ gh.val2 = exp(raw_prediction) # used as temporary
809
+
810
+ gh.val1 = -y_true / gh.val2 + (1 - y_true) * gh.val2 # gradient
811
+ gh.val2 = y_true / gh.val2 + (1 - y_true) * gh.val2 # hessian
812
+ return gh
813
+
814
+
815
+ # ---------------------------------------------------
816
+ # Extension Types for Loss Functions of 1-dim targets
817
+ # ---------------------------------------------------
818
+ cdef class CyLossFunction:
819
+ """Base class for convex loss functions."""
820
+
821
+ def __reduce__(self):
822
+ return (self.__class__, ())
823
+
824
+ cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil:
825
+ """Compute the loss for a single sample.
826
+
827
+ Parameters
828
+ ----------
829
+ y_true : double
830
+ Observed, true target value.
831
+ raw_prediction : double
832
+ Raw prediction value (in link space).
833
+
834
+ Returns
835
+ -------
836
+ double
837
+ The loss evaluated at `y_true` and `raw_prediction`.
838
+ """
839
+ pass
840
+
841
+ cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil:
842
+ """Compute gradient of loss w.r.t. raw_prediction for a single sample.
843
+
844
+ Parameters
845
+ ----------
846
+ y_true : double
847
+ Observed, true target value.
848
+ raw_prediction : double
849
+ Raw prediction value (in link space).
850
+
851
+ Returns
852
+ -------
853
+ double
854
+ The derivative of the loss function w.r.t. `raw_prediction`.
855
+ """
856
+ pass
857
+
858
+ cdef double_pair cy_grad_hess(
859
+ self, double y_true, double raw_prediction
860
+ ) noexcept nogil:
861
+ """Compute gradient and hessian.
862
+
863
+ Gradient and hessian of loss w.r.t. raw_prediction for a single sample.
864
+
865
+ This is usually diagonal in raw_prediction_i and raw_prediction_j.
866
+ Therefore, we return the diagonal element i=j.
867
+
868
+ For a loss with a non-canonical link, this might implement the diagonal
869
+ of the Fisher matrix (=expected hessian) instead of the hessian.
870
+
871
+ Parameters
872
+ ----------
873
+ y_true : double
874
+ Observed, true target value.
875
+ raw_prediction : double
876
+ Raw prediction value (in link space).
877
+
878
+ Returns
879
+ -------
880
+ double_pair
881
+ Gradient and hessian of the loss function w.r.t. `raw_prediction`.
882
+ """
883
+ pass
884
+
885
+ def loss(
886
+ self,
887
+ const floating_in[::1] y_true, # IN
888
+ const floating_in[::1] raw_prediction, # IN
889
+ const floating_in[::1] sample_weight, # IN
890
+ floating_out[::1] loss_out, # OUT
891
+ int n_threads=1
892
+ ):
893
+ """Compute the point-wise loss value for each input.
894
+
895
+ The point-wise loss is written to `loss_out` and no array is returned.
896
+
897
+ Parameters
898
+ ----------
899
+ y_true : array of shape (n_samples,)
900
+ Observed, true target values.
901
+ raw_prediction : array of shape (n_samples,)
902
+ Raw prediction values (in link space).
903
+ sample_weight : array of shape (n_samples,) or None
904
+ Sample weights.
905
+ loss_out : array of shape (n_samples,)
906
+ A location into which the result is stored.
907
+ n_threads : int
908
+ Number of threads used by OpenMP (if any).
909
+ """
910
+ pass
911
+
912
+ def gradient(
913
+ self,
914
+ const floating_in[::1] y_true, # IN
915
+ const floating_in[::1] raw_prediction, # IN
916
+ const floating_in[::1] sample_weight, # IN
917
+ floating_out[::1] gradient_out, # OUT
918
+ int n_threads=1
919
+ ):
920
+ """Compute gradient of loss w.r.t raw_prediction for each input.
921
+
922
+ The gradient is written to `gradient_out` and no array is returned.
923
+
924
+ Parameters
925
+ ----------
926
+ y_true : array of shape (n_samples,)
927
+ Observed, true target values.
928
+ raw_prediction : array of shape (n_samples,)
929
+ Raw prediction values (in link space).
930
+ sample_weight : array of shape (n_samples,) or None
931
+ Sample weights.
932
+ gradient_out : array of shape (n_samples,)
933
+ A location into which the result is stored.
934
+ n_threads : int
935
+ Number of threads used by OpenMP (if any).
936
+ """
937
+ pass
938
+
939
+ def loss_gradient(
940
+ self,
941
+ const floating_in[::1] y_true, # IN
942
+ const floating_in[::1] raw_prediction, # IN
943
+ const floating_in[::1] sample_weight, # IN
944
+ floating_out[::1] loss_out, # OUT
945
+ floating_out[::1] gradient_out, # OUT
946
+ int n_threads=1
947
+ ):
948
+ """Compute loss and gradient of loss w.r.t raw_prediction.
949
+
950
+ The loss and gradient are written to `loss_out` and `gradient_out` and no arrays
951
+ are returned.
952
+
953
+ Parameters
954
+ ----------
955
+ y_true : array of shape (n_samples,)
956
+ Observed, true target values.
957
+ raw_prediction : array of shape (n_samples,)
958
+ Raw prediction values (in link space).
959
+ sample_weight : array of shape (n_samples,) or None
960
+ Sample weights.
961
+ loss_out : array of shape (n_samples,) or None
962
+ A location into which the element-wise loss is stored.
963
+ gradient_out : array of shape (n_samples,)
964
+ A location into which the gradient is stored.
965
+ n_threads : int
966
+ Number of threads used by OpenMP (if any).
967
+ """
968
+ self.loss(y_true, raw_prediction, sample_weight, loss_out, n_threads)
969
+ self.gradient(y_true, raw_prediction, sample_weight, gradient_out, n_threads)
970
+
971
+ def gradient_hessian(
972
+ self,
973
+ const floating_in[::1] y_true, # IN
974
+ const floating_in[::1] raw_prediction, # IN
975
+ const floating_in[::1] sample_weight, # IN
976
+ floating_out[::1] gradient_out, # OUT
977
+ floating_out[::1] hessian_out, # OUT
978
+ int n_threads=1
979
+ ):
980
+ """Compute gradient and hessian of loss w.r.t raw_prediction.
981
+
982
+ The gradient and hessian are written to `gradient_out` and `hessian_out` and no
983
+ arrays are returned.
984
+
985
+ Parameters
986
+ ----------
987
+ y_true : array of shape (n_samples,)
988
+ Observed, true target values.
989
+ raw_prediction : array of shape (n_samples,)
990
+ Raw prediction values (in link space).
991
+ sample_weight : array of shape (n_samples,) or None
992
+ Sample weights.
993
+ gradient_out : array of shape (n_samples,)
994
+ A location into which the gradient is stored.
995
+ hessian_out : array of shape (n_samples,)
996
+ A location into which the hessian is stored.
997
+ n_threads : int
998
+ Number of threads used by OpenMP (if any).
999
+ """
1000
+ pass
1001
+
1002
+
1003
+ {{for name, docstring, param, closs, closs_grad, cgrad, cgrad_hess, in class_list}}
1004
+ {{py:
1005
+ if param is None:
1006
+ with_param = ""
1007
+ else:
1008
+ with_param = ", self." + param
1009
+ }}
1010
+
1011
+ cdef class {{name}}(CyLossFunction):
1012
+ """{{docstring}}"""
1013
+
1014
+ {{if param is not None}}
1015
+ def __init__(self, {{param}}):
1016
+ self.{{param}} = {{param}}
1017
+ {{endif}}
1018
+
1019
+ {{if param is not None}}
1020
+ def __reduce__(self):
1021
+ return (self.__class__, (self.{{param}},))
1022
+ {{endif}}
1023
+
1024
+ cdef inline double cy_loss(self, double y_true, double raw_prediction) noexcept nogil:
1025
+ return {{closs}}(y_true, raw_prediction{{with_param}})
1026
+
1027
+ cdef inline double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil:
1028
+ return {{cgrad}}(y_true, raw_prediction{{with_param}})
1029
+
1030
+ cdef inline double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil:
1031
+ return {{cgrad_hess}}(y_true, raw_prediction{{with_param}})
1032
+
1033
+ def loss(
1034
+ self,
1035
+ const floating_in[::1] y_true, # IN
1036
+ const floating_in[::1] raw_prediction, # IN
1037
+ const floating_in[::1] sample_weight, # IN
1038
+ floating_out[::1] loss_out, # OUT
1039
+ int n_threads=1
1040
+ ):
1041
+ cdef:
1042
+ int i
1043
+ int n_samples = y_true.shape[0]
1044
+
1045
+ if sample_weight is None:
1046
+ for i in prange(
1047
+ n_samples, schedule='static', nogil=True, num_threads=n_threads
1048
+ ):
1049
+ loss_out[i] = {{closs}}(y_true[i], raw_prediction[i]{{with_param}})
1050
+ else:
1051
+ for i in prange(
1052
+ n_samples, schedule='static', nogil=True, num_threads=n_threads
1053
+ ):
1054
+ loss_out[i] = sample_weight[i] * {{closs}}(y_true[i], raw_prediction[i]{{with_param}})
1055
+
1056
+ {{if closs_grad is not None}}
1057
+ def loss_gradient(
1058
+ self,
1059
+ const floating_in[::1] y_true, # IN
1060
+ const floating_in[::1] raw_prediction, # IN
1061
+ const floating_in[::1] sample_weight, # IN
1062
+ floating_out[::1] loss_out, # OUT
1063
+ floating_out[::1] gradient_out, # OUT
1064
+ int n_threads=1
1065
+ ):
1066
+ cdef:
1067
+ int i
1068
+ int n_samples = y_true.shape[0]
1069
+ double_pair dbl2
1070
+
1071
+ if sample_weight is None:
1072
+ for i in prange(
1073
+ n_samples, schedule='static', nogil=True, num_threads=n_threads
1074
+ ):
1075
+ dbl2 = {{closs_grad}}(y_true[i], raw_prediction[i]{{with_param}})
1076
+ loss_out[i] = dbl2.val1
1077
+ gradient_out[i] = dbl2.val2
1078
+ else:
1079
+ for i in prange(
1080
+ n_samples, schedule='static', nogil=True, num_threads=n_threads
1081
+ ):
1082
+ dbl2 = {{closs_grad}}(y_true[i], raw_prediction[i]{{with_param}})
1083
+ loss_out[i] = sample_weight[i] * dbl2.val1
1084
+ gradient_out[i] = sample_weight[i] * dbl2.val2
1085
+
1086
+ {{endif}}
1087
+
1088
+ def gradient(
1089
+ self,
1090
+ const floating_in[::1] y_true, # IN
1091
+ const floating_in[::1] raw_prediction, # IN
1092
+ const floating_in[::1] sample_weight, # IN
1093
+ floating_out[::1] gradient_out, # OUT
1094
+ int n_threads=1
1095
+ ):
1096
+ cdef:
1097
+ int i
1098
+ int n_samples = y_true.shape[0]
1099
+
1100
+ if sample_weight is None:
1101
+ for i in prange(
1102
+ n_samples, schedule='static', nogil=True, num_threads=n_threads
1103
+ ):
1104
+ gradient_out[i] = {{cgrad}}(y_true[i], raw_prediction[i]{{with_param}})
1105
+ else:
1106
+ for i in prange(
1107
+ n_samples, schedule='static', nogil=True, num_threads=n_threads
1108
+ ):
1109
+ gradient_out[i] = sample_weight[i] * {{cgrad}}(y_true[i], raw_prediction[i]{{with_param}})
1110
+
1111
+ def gradient_hessian(
1112
+ self,
1113
+ const floating_in[::1] y_true, # IN
1114
+ const floating_in[::1] raw_prediction, # IN
1115
+ const floating_in[::1] sample_weight, # IN
1116
+ floating_out[::1] gradient_out, # OUT
1117
+ floating_out[::1] hessian_out, # OUT
1118
+ int n_threads=1
1119
+ ):
1120
+ cdef:
1121
+ int i
1122
+ int n_samples = y_true.shape[0]
1123
+ double_pair dbl2
1124
+
1125
+ if sample_weight is None:
1126
+ for i in prange(
1127
+ n_samples, schedule='static', nogil=True, num_threads=n_threads
1128
+ ):
1129
+ dbl2 = {{cgrad_hess}}(y_true[i], raw_prediction[i]{{with_param}})
1130
+ gradient_out[i] = dbl2.val1
1131
+ hessian_out[i] = dbl2.val2
1132
+ else:
1133
+ for i in prange(
1134
+ n_samples, schedule='static', nogil=True, num_threads=n_threads
1135
+ ):
1136
+ dbl2 = {{cgrad_hess}}(y_true[i], raw_prediction[i]{{with_param}})
1137
+ gradient_out[i] = sample_weight[i] * dbl2.val1
1138
+ hessian_out[i] = sample_weight[i] * dbl2.val2
1139
+
1140
+ {{endfor}}
1141
+
1142
+
1143
+ # The multinomial deviance loss is also known as categorical cross-entropy or
1144
+ # multinomial log-likelihood.
1145
+ # Here, we do not inherit from CyLossFunction as its cy_gradient method deviates
1146
+ # from the API.
1147
+ cdef class CyHalfMultinomialLoss():
1148
+ """Half Multinomial deviance loss with multinomial logit link.
1149
+
1150
+ Domain:
1151
+ y_true in {0, 1, 2, 3, .., n_classes - 1}
1152
+ y_pred in (0, 1)**n_classes, i.e. interval with boundaries excluded
1153
+
1154
+ Link:
1155
+ y_pred = softmax(raw_prediction)
1156
+
1157
+ Note: Label encoding is built-in, i.e. {0, 1, 2, 3, .., n_classes - 1} is
1158
+ mapped to (y_true == k) for k = 0 .. n_classes - 1 which is either 0 or 1.
1159
+ """
1160
+
1161
+ # Here we deviate from the CyLossFunction API. SAG/SAGA needs direct access to
1162
+ # sample-wise gradients which we provide here.
1163
+ cdef inline void cy_gradient(
1164
+ self,
1165
+ const floating_in y_true,
1166
+ const floating_in[::1] raw_prediction, # IN
1167
+ const floating_in sample_weight,
1168
+ floating_out[::1] gradient_out, # OUT
1169
+ ) noexcept nogil:
1170
+ """Compute gradient of loss w.r.t. `raw_prediction` for a single sample.
1171
+
1172
+ The gradient of the multinomial logistic loss with respect to a class k,
1173
+ and for one sample is:
1174
+ grad_k = - sw * (p[k] - (y==k))
1175
+
1176
+ where:
1177
+ p[k] = proba[k] = exp(raw_prediction[k] - logsumexp(raw_prediction))
1178
+ sw = sample_weight
1179
+
1180
+ Parameters
1181
+ ----------
1182
+ y_true : double
1183
+ Observed, true target value.
1184
+ raw_prediction : array of shape (n_classes,)
1185
+ Raw prediction values (in link space).
1186
+ sample_weight : double
1187
+ Sample weight.
1188
+ gradient_out : array of shape (n_classs,)
1189
+ A location into which the gradient is stored.
1190
+
1191
+ Returns
1192
+ -------
1193
+ gradient : double
1194
+ The derivative of the loss function w.r.t. `raw_prediction`.
1195
+ """
1196
+ cdef:
1197
+ int k
1198
+ int n_classes = raw_prediction.shape[0]
1199
+ double_pair max_value_and_sum_exps
1200
+ const floating_in[:, :] raw = raw_prediction[None, :]
1201
+
1202
+ max_value_and_sum_exps = sum_exp_minus_max(0, raw, &gradient_out[0])
1203
+ for k in range(n_classes):
1204
+ # gradient_out[k] = p_k = y_pred_k = prob of class k
1205
+ gradient_out[k] /= max_value_and_sum_exps.val2
1206
+ # gradient_k = (p_k - (y_true == k)) * sw
1207
+ gradient_out[k] = (gradient_out[k] - (y_true == k)) * sample_weight
1208
+
1209
+ def _test_cy_gradient(
1210
+ self,
1211
+ const floating_in[::1] y_true, # IN
1212
+ const floating_in[:, ::1] raw_prediction, # IN
1213
+ const floating_in[::1] sample_weight, # IN
1214
+ ):
1215
+ """For testing only."""
1216
+ cdef:
1217
+ int i, k
1218
+ int n_samples = y_true.shape[0]
1219
+ int n_classes = raw_prediction.shape[1]
1220
+ floating_in [:, ::1] gradient_out
1221
+ gradient = np.empty((n_samples, n_classes), dtype=np.float64)
1222
+ gradient_out = gradient
1223
+
1224
+ for i in range(n_samples):
1225
+ self.cy_gradient(
1226
+ y_true=y_true[i],
1227
+ raw_prediction=raw_prediction[i, :],
1228
+ sample_weight=1.0 if sample_weight is None else sample_weight[i],
1229
+ gradient_out=gradient_out[i, :],
1230
+ )
1231
+ return gradient
1232
+
1233
+ # Note that we do not assume memory alignment/contiguity of 2d arrays.
1234
+ # There seems to be little benefit in doing so. Benchmarks proofing the
1235
+ # opposite are welcome.
1236
+ def loss(
1237
+ self,
1238
+ const floating_in[::1] y_true, # IN
1239
+ const floating_in[:, :] raw_prediction, # IN
1240
+ const floating_in[::1] sample_weight, # IN
1241
+ floating_out[::1] loss_out, # OUT
1242
+ int n_threads=1
1243
+ ):
1244
+ cdef:
1245
+ int i, k
1246
+ int n_samples = y_true.shape[0]
1247
+ int n_classes = raw_prediction.shape[1]
1248
+ floating_in max_value, sum_exps
1249
+ floating_in* p # temporary buffer
1250
+ double_pair max_value_and_sum_exps
1251
+
1252
+ # We assume n_samples > n_classes. In this case having the inner loop
1253
+ # over n_classes is a good default.
1254
+ # TODO: If every memoryview is contiguous and raw_prediction is
1255
+ # f-contiguous, can we write a better algo (loops) to improve
1256
+ # performance?
1257
+ if sample_weight is None:
1258
+ # inner loop over n_classes
1259
+ with nogil, parallel(num_threads=n_threads):
1260
+ # Define private buffer variables as each thread might use its
1261
+ # own.
1262
+ p = <floating_in *> malloc(sizeof(floating_in) * (n_classes))
1263
+
1264
+ for i in prange(n_samples, schedule='static'):
1265
+ max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p)
1266
+ max_value = max_value_and_sum_exps.val1
1267
+ sum_exps = max_value_and_sum_exps.val2
1268
+ loss_out[i] = log(sum_exps) + max_value
1269
+
1270
+ # label encoded y_true
1271
+ k = int(y_true[i])
1272
+ loss_out[i] -= raw_prediction[i, k]
1273
+
1274
+ free(p)
1275
+ else:
1276
+ with nogil, parallel(num_threads=n_threads):
1277
+ p = <floating_in *> malloc(sizeof(floating_in) * (n_classes))
1278
+
1279
+ for i in prange(n_samples, schedule='static'):
1280
+ max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p)
1281
+ max_value = max_value_and_sum_exps.val1
1282
+ sum_exps = max_value_and_sum_exps.val2
1283
+ loss_out[i] = log(sum_exps) + max_value
1284
+
1285
+ # label encoded y_true
1286
+ k = int(y_true[i])
1287
+ loss_out[i] -= raw_prediction[i, k]
1288
+
1289
+ loss_out[i] *= sample_weight[i]
1290
+
1291
+ free(p)
1292
+
1293
+ def loss_gradient(
1294
+ self,
1295
+ const floating_in[::1] y_true, # IN
1296
+ const floating_in[:, :] raw_prediction, # IN
1297
+ const floating_in[::1] sample_weight, # IN
1298
+ floating_out[::1] loss_out, # OUT
1299
+ floating_out[:, :] gradient_out, # OUT
1300
+ int n_threads=1
1301
+ ):
1302
+ cdef:
1303
+ int i, k
1304
+ int n_samples = y_true.shape[0]
1305
+ int n_classes = raw_prediction.shape[1]
1306
+ floating_in max_value, sum_exps
1307
+ floating_in* p # temporary buffer
1308
+ double_pair max_value_and_sum_exps
1309
+
1310
+ if sample_weight is None:
1311
+ # inner loop over n_classes
1312
+ with nogil, parallel(num_threads=n_threads):
1313
+ # Define private buffer variables as each thread might use its
1314
+ # own.
1315
+ p = <floating_in *> malloc(sizeof(floating_in) * (n_classes))
1316
+
1317
+ for i in prange(n_samples, schedule='static'):
1318
+ max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p)
1319
+ max_value = max_value_and_sum_exps.val1
1320
+ sum_exps = max_value_and_sum_exps.val2
1321
+ loss_out[i] = log(sum_exps) + max_value
1322
+
1323
+ for k in range(n_classes):
1324
+ # label decode y_true
1325
+ if y_true[i] == k:
1326
+ loss_out[i] -= raw_prediction[i, k]
1327
+ p[k] /= sum_exps # p_k = y_pred_k = prob of class k
1328
+ # gradient_k = p_k - (y_true == k)
1329
+ gradient_out[i, k] = p[k] - (y_true[i] == k)
1330
+
1331
+ free(p)
1332
+ else:
1333
+ with nogil, parallel(num_threads=n_threads):
1334
+ p = <floating_in *> malloc(sizeof(floating_in) * (n_classes))
1335
+
1336
+ for i in prange(n_samples, schedule='static'):
1337
+ max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p)
1338
+ max_value = max_value_and_sum_exps.val1
1339
+ sum_exps = max_value_and_sum_exps.val2
1340
+ loss_out[i] = log(sum_exps) + max_value
1341
+
1342
+ for k in range(n_classes):
1343
+ # label decode y_true
1344
+ if y_true[i] == k:
1345
+ loss_out[i] -= raw_prediction[i, k]
1346
+ p[k] /= sum_exps # p_k = y_pred_k = prob of class k
1347
+ # gradient_k = (p_k - (y_true == k)) * sw
1348
+ gradient_out[i, k] = (p[k] - (y_true[i] == k)) * sample_weight[i]
1349
+
1350
+ loss_out[i] *= sample_weight[i]
1351
+
1352
+ free(p)
1353
+
1354
+ def gradient(
1355
+ self,
1356
+ const floating_in[::1] y_true, # IN
1357
+ const floating_in[:, :] raw_prediction, # IN
1358
+ const floating_in[::1] sample_weight, # IN
1359
+ floating_out[:, :] gradient_out, # OUT
1360
+ int n_threads=1
1361
+ ):
1362
+ cdef:
1363
+ int i, k
1364
+ int n_samples = y_true.shape[0]
1365
+ int n_classes = raw_prediction.shape[1]
1366
+ floating_in sum_exps
1367
+ floating_in* p # temporary buffer
1368
+ double_pair max_value_and_sum_exps
1369
+
1370
+ if sample_weight is None:
1371
+ # inner loop over n_classes
1372
+ with nogil, parallel(num_threads=n_threads):
1373
+ # Define private buffer variables as each thread might use its
1374
+ # own.
1375
+ p = <floating_in *> malloc(sizeof(floating_in) * (n_classes))
1376
+
1377
+ for i in prange(n_samples, schedule='static'):
1378
+ max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p)
1379
+ sum_exps = max_value_and_sum_exps.val2
1380
+
1381
+ for k in range(n_classes):
1382
+ p[k] /= sum_exps # p_k = y_pred_k = prob of class k
1383
+ # gradient_k = y_pred_k - (y_true == k)
1384
+ gradient_out[i, k] = p[k] - (y_true[i] == k)
1385
+
1386
+ free(p)
1387
+ else:
1388
+ with nogil, parallel(num_threads=n_threads):
1389
+ p = <floating_in *> malloc(sizeof(floating_in) * (n_classes))
1390
+
1391
+ for i in prange(n_samples, schedule='static'):
1392
+ max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p)
1393
+ sum_exps = max_value_and_sum_exps.val2
1394
+
1395
+ for k in range(n_classes):
1396
+ p[k] /= sum_exps # p_k = y_pred_k = prob of class k
1397
+ # gradient_k = (p_k - (y_true == k)) * sw
1398
+ gradient_out[i, k] = (p[k] - (y_true[i] == k)) * sample_weight[i]
1399
+
1400
+ free(p)
1401
+
1402
+ def gradient_hessian(
1403
+ self,
1404
+ const floating_in[::1] y_true, # IN
1405
+ const floating_in[:, :] raw_prediction, # IN
1406
+ const floating_in[::1] sample_weight, # IN
1407
+ floating_out[:, :] gradient_out, # OUT
1408
+ floating_out[:, :] hessian_out, # OUT
1409
+ int n_threads=1
1410
+ ):
1411
+ cdef:
1412
+ int i, k
1413
+ int n_samples = y_true.shape[0]
1414
+ int n_classes = raw_prediction.shape[1]
1415
+ floating_in sum_exps
1416
+ floating_in* p # temporary buffer
1417
+ double_pair max_value_and_sum_exps
1418
+
1419
+ if sample_weight is None:
1420
+ # inner loop over n_classes
1421
+ with nogil, parallel(num_threads=n_threads):
1422
+ # Define private buffer variables as each thread might use its
1423
+ # own.
1424
+ p = <floating_in *> malloc(sizeof(floating_in) * (n_classes))
1425
+
1426
+ for i in prange(n_samples, schedule='static'):
1427
+ max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p)
1428
+ sum_exps = max_value_and_sum_exps.val2
1429
+
1430
+ for k in range(n_classes):
1431
+ p[k] /= sum_exps # p_k = y_pred_k = prob of class k
1432
+ # hessian_k = p_k * (1 - p_k)
1433
+ # gradient_k = p_k - (y_true == k)
1434
+ gradient_out[i, k] = p[k] - (y_true[i] == k)
1435
+ hessian_out[i, k] = p[k] * (1. - p[k])
1436
+
1437
+ free(p)
1438
+ else:
1439
+ with nogil, parallel(num_threads=n_threads):
1440
+ p = <floating_in *> malloc(sizeof(floating_in) * (n_classes))
1441
+
1442
+ for i in prange(n_samples, schedule='static'):
1443
+ max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p)
1444
+ sum_exps = max_value_and_sum_exps.val2
1445
+
1446
+ for k in range(n_classes):
1447
+ p[k] /= sum_exps # p_k = y_pred_k = prob of class k
1448
+ # gradient_k = (p_k - (y_true == k)) * sw
1449
+ # hessian_k = p_k * (1 - p_k) * sw
1450
+ gradient_out[i, k] = (p[k] - (y_true[i] == k)) * sample_weight[i]
1451
+ hessian_out[i, k] = (p[k] * (1. - p[k])) * sample_weight[i]
1452
+
1453
+ free(p)
1454
+
1455
+ # This method simplifies the implementation of hessp in linear models,
1456
+ # i.e. the matrix-vector product of the full hessian, not only of the
1457
+ # diagonal (in the classes) approximation as implemented above.
1458
+ def gradient_proba(
1459
+ self,
1460
+ const floating_in[::1] y_true, # IN
1461
+ const floating_in[:, :] raw_prediction, # IN
1462
+ const floating_in[::1] sample_weight, # IN
1463
+ floating_out[:, :] gradient_out, # OUT
1464
+ floating_out[:, :] proba_out, # OUT
1465
+ int n_threads=1
1466
+ ):
1467
+ cdef:
1468
+ int i, k
1469
+ int n_samples = y_true.shape[0]
1470
+ int n_classes = raw_prediction.shape[1]
1471
+ floating_in sum_exps
1472
+ floating_in* p # temporary buffer
1473
+ double_pair max_value_and_sum_exps
1474
+
1475
+ if sample_weight is None:
1476
+ # inner loop over n_classes
1477
+ with nogil, parallel(num_threads=n_threads):
1478
+ # Define private buffer variables as each thread might use its
1479
+ # own.
1480
+ p = <floating_in *> malloc(sizeof(floating_in) * (n_classes))
1481
+
1482
+ for i in prange(n_samples, schedule='static'):
1483
+ max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p)
1484
+ sum_exps = max_value_and_sum_exps.val2
1485
+
1486
+ for k in range(n_classes):
1487
+ proba_out[i, k] = p[k] / sum_exps # y_pred_k = prob of class k
1488
+ # gradient_k = y_pred_k - (y_true == k)
1489
+ gradient_out[i, k] = proba_out[i, k] - (y_true[i] == k)
1490
+
1491
+ free(p)
1492
+ else:
1493
+ with nogil, parallel(num_threads=n_threads):
1494
+ p = <floating_in *> malloc(sizeof(floating_in) * (n_classes))
1495
+
1496
+ for i in prange(n_samples, schedule='static'):
1497
+ max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p)
1498
+ sum_exps = max_value_and_sum_exps.val2
1499
+
1500
+ for k in range(n_classes):
1501
+ proba_out[i, k] = p[k] / sum_exps # y_pred_k = prob of class k
1502
+ # gradient_k = (p_k - (y_true == k)) * sw
1503
+ gradient_out[i, k] = (proba_out[i, k] - (y_true[i] == k)) * sample_weight[i]
1504
+
1505
+ free(p)
.venv/lib/python3.12/site-packages/sklearn/_loss/link.py ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Module contains classes for invertible (and differentiable) link functions.
3
+ """
4
+
5
+ # Authors: The scikit-learn developers
6
+ # SPDX-License-Identifier: BSD-3-Clause
7
+
8
+ from abc import ABC, abstractmethod
9
+ from dataclasses import dataclass
10
+
11
+ import numpy as np
12
+ from scipy.special import expit, logit
13
+ from scipy.stats import gmean
14
+
15
+ from ..utils.extmath import softmax
16
+
17
+
18
+ @dataclass
19
+ class Interval:
20
+ low: float
21
+ high: float
22
+ low_inclusive: bool
23
+ high_inclusive: bool
24
+
25
+ def __post_init__(self):
26
+ """Check that low <= high"""
27
+ if self.low > self.high:
28
+ raise ValueError(
29
+ f"One must have low <= high; got low={self.low}, high={self.high}."
30
+ )
31
+
32
+ def includes(self, x):
33
+ """Test whether all values of x are in interval range.
34
+
35
+ Parameters
36
+ ----------
37
+ x : ndarray
38
+ Array whose elements are tested to be in interval range.
39
+
40
+ Returns
41
+ -------
42
+ result : bool
43
+ """
44
+ if self.low_inclusive:
45
+ low = np.greater_equal(x, self.low)
46
+ else:
47
+ low = np.greater(x, self.low)
48
+
49
+ if not np.all(low):
50
+ return False
51
+
52
+ if self.high_inclusive:
53
+ high = np.less_equal(x, self.high)
54
+ else:
55
+ high = np.less(x, self.high)
56
+
57
+ # Note: np.all returns numpy.bool_
58
+ return bool(np.all(high))
59
+
60
+
61
+ def _inclusive_low_high(interval, dtype=np.float64):
62
+ """Generate values low and high to be within the interval range.
63
+
64
+ This is used in tests only.
65
+
66
+ Returns
67
+ -------
68
+ low, high : tuple
69
+ The returned values low and high lie within the interval.
70
+ """
71
+ eps = 10 * np.finfo(dtype).eps
72
+ if interval.low == -np.inf:
73
+ low = -1e10
74
+ elif interval.low < 0:
75
+ low = interval.low * (1 - eps) + eps
76
+ else:
77
+ low = interval.low * (1 + eps) + eps
78
+
79
+ if interval.high == np.inf:
80
+ high = 1e10
81
+ elif interval.high < 0:
82
+ high = interval.high * (1 + eps) - eps
83
+ else:
84
+ high = interval.high * (1 - eps) - eps
85
+
86
+ return low, high
87
+
88
+
89
+ class BaseLink(ABC):
90
+ """Abstract base class for differentiable, invertible link functions.
91
+
92
+ Convention:
93
+ - link function g: raw_prediction = g(y_pred)
94
+ - inverse link h: y_pred = h(raw_prediction)
95
+
96
+ For (generalized) linear models, `raw_prediction = X @ coef` is the so
97
+ called linear predictor, and `y_pred = h(raw_prediction)` is the predicted
98
+ conditional (on X) expected value of the target `y_true`.
99
+
100
+ The methods are not implemented as staticmethods in case a link function needs
101
+ parameters.
102
+ """
103
+
104
+ is_multiclass = False # used for testing only
105
+
106
+ # Usually, raw_prediction may be any real number and y_pred is an open
107
+ # interval.
108
+ # interval_raw_prediction = Interval(-np.inf, np.inf, False, False)
109
+ interval_y_pred = Interval(-np.inf, np.inf, False, False)
110
+
111
+ @abstractmethod
112
+ def link(self, y_pred, out=None):
113
+ """Compute the link function g(y_pred).
114
+
115
+ The link function maps (predicted) target values to raw predictions,
116
+ i.e. `g(y_pred) = raw_prediction`.
117
+
118
+ Parameters
119
+ ----------
120
+ y_pred : array
121
+ Predicted target values.
122
+ out : array
123
+ A location into which the result is stored. If provided, it must
124
+ have a shape that the inputs broadcast to. If not provided or None,
125
+ a freshly-allocated array is returned.
126
+
127
+ Returns
128
+ -------
129
+ out : array
130
+ Output array, element-wise link function.
131
+ """
132
+
133
+ @abstractmethod
134
+ def inverse(self, raw_prediction, out=None):
135
+ """Compute the inverse link function h(raw_prediction).
136
+
137
+ The inverse link function maps raw predictions to predicted target
138
+ values, i.e. `h(raw_prediction) = y_pred`.
139
+
140
+ Parameters
141
+ ----------
142
+ raw_prediction : array
143
+ Raw prediction values (in link space).
144
+ out : array
145
+ A location into which the result is stored. If provided, it must
146
+ have a shape that the inputs broadcast to. If not provided or None,
147
+ a freshly-allocated array is returned.
148
+
149
+ Returns
150
+ -------
151
+ out : array
152
+ Output array, element-wise inverse link function.
153
+ """
154
+
155
+
156
+ class IdentityLink(BaseLink):
157
+ """The identity link function g(x)=x."""
158
+
159
+ def link(self, y_pred, out=None):
160
+ if out is not None:
161
+ np.copyto(out, y_pred)
162
+ return out
163
+ else:
164
+ return y_pred
165
+
166
+ inverse = link
167
+
168
+
169
+ class LogLink(BaseLink):
170
+ """The log link function g(x)=log(x)."""
171
+
172
+ interval_y_pred = Interval(0, np.inf, False, False)
173
+
174
+ def link(self, y_pred, out=None):
175
+ return np.log(y_pred, out=out)
176
+
177
+ def inverse(self, raw_prediction, out=None):
178
+ return np.exp(raw_prediction, out=out)
179
+
180
+
181
+ class LogitLink(BaseLink):
182
+ """The logit link function g(x)=logit(x)."""
183
+
184
+ interval_y_pred = Interval(0, 1, False, False)
185
+
186
+ def link(self, y_pred, out=None):
187
+ return logit(y_pred, out=out)
188
+
189
+ def inverse(self, raw_prediction, out=None):
190
+ return expit(raw_prediction, out=out)
191
+
192
+
193
+ class HalfLogitLink(BaseLink):
194
+ """Half the logit link function g(x)=1/2 * logit(x).
195
+
196
+ Used for the exponential loss.
197
+ """
198
+
199
+ interval_y_pred = Interval(0, 1, False, False)
200
+
201
+ def link(self, y_pred, out=None):
202
+ out = logit(y_pred, out=out)
203
+ out *= 0.5
204
+ return out
205
+
206
+ def inverse(self, raw_prediction, out=None):
207
+ return expit(2 * raw_prediction, out)
208
+
209
+
210
+ class MultinomialLogit(BaseLink):
211
+ """The symmetric multinomial logit function.
212
+
213
+ Convention:
214
+ - y_pred.shape = raw_prediction.shape = (n_samples, n_classes)
215
+
216
+ Notes:
217
+ - The inverse link h is the softmax function.
218
+ - The sum is over the second axis, i.e. axis=1 (n_classes).
219
+
220
+ We have to choose additional constraints in order to make
221
+
222
+ y_pred[k] = exp(raw_pred[k]) / sum(exp(raw_pred[k]), k=0..n_classes-1)
223
+
224
+ for n_classes classes identifiable and invertible.
225
+ We choose the symmetric side constraint where the geometric mean response
226
+ is set as reference category, see [2]:
227
+
228
+ The symmetric multinomial logit link function for a single data point is
229
+ then defined as
230
+
231
+ raw_prediction[k] = g(y_pred[k]) = log(y_pred[k]/gmean(y_pred))
232
+ = log(y_pred[k]) - mean(log(y_pred)).
233
+
234
+ Note that this is equivalent to the definition in [1] and implies mean
235
+ centered raw predictions:
236
+
237
+ sum(raw_prediction[k], k=0..n_classes-1) = 0.
238
+
239
+ For linear models with raw_prediction = X @ coef, this corresponds to
240
+ sum(coef[k], k=0..n_classes-1) = 0, i.e. the sum over classes for every
241
+ feature is zero.
242
+
243
+ Reference
244
+ ---------
245
+ .. [1] Friedman, Jerome; Hastie, Trevor; Tibshirani, Robert. "Additive
246
+ logistic regression: a statistical view of boosting" Ann. Statist.
247
+ 28 (2000), no. 2, 337--407. doi:10.1214/aos/1016218223.
248
+ https://projecteuclid.org/euclid.aos/1016218223
249
+
250
+ .. [2] Zahid, Faisal Maqbool and Gerhard Tutz. "Ridge estimation for
251
+ multinomial logit models with symmetric side constraints."
252
+ Computational Statistics 28 (2013): 1017-1034.
253
+ http://epub.ub.uni-muenchen.de/11001/1/tr067.pdf
254
+ """
255
+
256
+ is_multiclass = True
257
+ interval_y_pred = Interval(0, 1, False, False)
258
+
259
+ def symmetrize_raw_prediction(self, raw_prediction):
260
+ return raw_prediction - np.mean(raw_prediction, axis=1)[:, np.newaxis]
261
+
262
+ def link(self, y_pred, out=None):
263
+ # geometric mean as reference category
264
+ gm = gmean(y_pred, axis=1)
265
+ return np.log(y_pred / gm[:, np.newaxis], out=out)
266
+
267
+ def inverse(self, raw_prediction, out=None):
268
+ if out is None:
269
+ return softmax(raw_prediction, copy=True)
270
+ else:
271
+ np.copyto(out, raw_prediction)
272
+ softmax(out, copy=False)
273
+ return out
274
+
275
+
276
+ _LINKS = {
277
+ "identity": IdentityLink,
278
+ "log": LogLink,
279
+ "logit": LogitLink,
280
+ "half_logit": HalfLogitLink,
281
+ "multinomial_logit": MultinomialLogit,
282
+ }
.venv/lib/python3.12/site-packages/sklearn/_loss/loss.py ADDED
@@ -0,0 +1,1181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This module contains loss classes suitable for fitting.
3
+
4
+ It is not part of the public API.
5
+ Specific losses are used for regression, binary classification or multiclass
6
+ classification.
7
+ """
8
+
9
+ # Authors: The scikit-learn developers
10
+ # SPDX-License-Identifier: BSD-3-Clause
11
+
12
+ # Goals:
13
+ # - Provide a common private module for loss functions/classes.
14
+ # - To be used in:
15
+ # - LogisticRegression
16
+ # - PoissonRegressor, GammaRegressor, TweedieRegressor
17
+ # - HistGradientBoostingRegressor, HistGradientBoostingClassifier
18
+ # - GradientBoostingRegressor, GradientBoostingClassifier
19
+ # - SGDRegressor, SGDClassifier
20
+ # - Replace link module of GLMs.
21
+
22
+ import numbers
23
+
24
+ import numpy as np
25
+ from scipy.special import xlogy
26
+
27
+ from ..utils import check_scalar
28
+ from ..utils.stats import _weighted_percentile
29
+ from ._loss import (
30
+ CyAbsoluteError,
31
+ CyExponentialLoss,
32
+ CyHalfBinomialLoss,
33
+ CyHalfGammaLoss,
34
+ CyHalfMultinomialLoss,
35
+ CyHalfPoissonLoss,
36
+ CyHalfSquaredError,
37
+ CyHalfTweedieLoss,
38
+ CyHalfTweedieLossIdentity,
39
+ CyHuberLoss,
40
+ CyPinballLoss,
41
+ )
42
+ from .link import (
43
+ HalfLogitLink,
44
+ IdentityLink,
45
+ Interval,
46
+ LogitLink,
47
+ LogLink,
48
+ MultinomialLogit,
49
+ )
50
+
51
+
52
+ # Note: The shape of raw_prediction for multiclass classifications are
53
+ # - GradientBoostingClassifier: (n_samples, n_classes)
54
+ # - HistGradientBoostingClassifier: (n_classes, n_samples)
55
+ #
56
+ # Note: Instead of inheritance like
57
+ #
58
+ # class BaseLoss(BaseLink, CyLossFunction):
59
+ # ...
60
+ #
61
+ # # Note: Naturally, we would inherit in the following order
62
+ # # class HalfSquaredError(IdentityLink, CyHalfSquaredError, BaseLoss)
63
+ # # But because of https://github.com/cython/cython/issues/4350 we set BaseLoss as
64
+ # # the last one. This, of course, changes the MRO.
65
+ # class HalfSquaredError(IdentityLink, CyHalfSquaredError, BaseLoss):
66
+ #
67
+ # we use composition. This way we improve maintainability by avoiding the above
68
+ # mentioned Cython edge case and have easier to understand code (which method calls
69
+ # which code).
70
+ class BaseLoss:
71
+ """Base class for a loss function of 1-dimensional targets.
72
+
73
+ Conventions:
74
+
75
+ - y_true.shape = sample_weight.shape = (n_samples,)
76
+ - y_pred.shape = raw_prediction.shape = (n_samples,)
77
+ - If is_multiclass is true (multiclass classification), then
78
+ y_pred.shape = raw_prediction.shape = (n_samples, n_classes)
79
+ Note that this corresponds to the return value of decision_function.
80
+
81
+ y_true, y_pred, sample_weight and raw_prediction must either be all float64
82
+ or all float32.
83
+ gradient and hessian must be either both float64 or both float32.
84
+
85
+ Note that y_pred = link.inverse(raw_prediction).
86
+
87
+ Specific loss classes can inherit specific link classes to satisfy
88
+ BaseLink's abstractmethods.
89
+
90
+ Parameters
91
+ ----------
92
+ sample_weight : {None, ndarray}
93
+ If sample_weight is None, the hessian might be constant.
94
+ n_classes : {None, int}
95
+ The number of classes for classification, else None.
96
+
97
+ Attributes
98
+ ----------
99
+ closs: CyLossFunction
100
+ link : BaseLink
101
+ interval_y_true : Interval
102
+ Valid interval for y_true
103
+ interval_y_pred : Interval
104
+ Valid Interval for y_pred
105
+ differentiable : bool
106
+ Indicates whether or not loss function is differentiable in
107
+ raw_prediction everywhere.
108
+ need_update_leaves_values : bool
109
+ Indicates whether decision trees in gradient boosting need to uptade
110
+ leave values after having been fit to the (negative) gradients.
111
+ approx_hessian : bool
112
+ Indicates whether the hessian is approximated or exact. If,
113
+ approximated, it should be larger or equal to the exact one.
114
+ constant_hessian : bool
115
+ Indicates whether the hessian is one for this loss.
116
+ is_multiclass : bool
117
+ Indicates whether n_classes > 2 is allowed.
118
+ """
119
+
120
+ # For gradient boosted decision trees:
121
+ # This variable indicates whether the loss requires the leaves values to
122
+ # be updated once the tree has been trained. The trees are trained to
123
+ # predict a Newton-Raphson step (see grower._finalize_leaf()). But for
124
+ # some losses (e.g. least absolute deviation) we need to adjust the tree
125
+ # values to account for the "line search" of the gradient descent
126
+ # procedure. See the original paper Greedy Function Approximation: A
127
+ # Gradient Boosting Machine by Friedman
128
+ # (https://statweb.stanford.edu/~jhf/ftp/trebst.pdf) for the theory.
129
+ differentiable = True
130
+ need_update_leaves_values = False
131
+ is_multiclass = False
132
+
133
+ def __init__(self, closs, link, n_classes=None):
134
+ self.closs = closs
135
+ self.link = link
136
+ self.approx_hessian = False
137
+ self.constant_hessian = False
138
+ self.n_classes = n_classes
139
+ self.interval_y_true = Interval(-np.inf, np.inf, False, False)
140
+ self.interval_y_pred = self.link.interval_y_pred
141
+
142
+ def in_y_true_range(self, y):
143
+ """Return True if y is in the valid range of y_true.
144
+
145
+ Parameters
146
+ ----------
147
+ y : ndarray
148
+ """
149
+ return self.interval_y_true.includes(y)
150
+
151
+ def in_y_pred_range(self, y):
152
+ """Return True if y is in the valid range of y_pred.
153
+
154
+ Parameters
155
+ ----------
156
+ y : ndarray
157
+ """
158
+ return self.interval_y_pred.includes(y)
159
+
160
+ def loss(
161
+ self,
162
+ y_true,
163
+ raw_prediction,
164
+ sample_weight=None,
165
+ loss_out=None,
166
+ n_threads=1,
167
+ ):
168
+ """Compute the pointwise loss value for each input.
169
+
170
+ Parameters
171
+ ----------
172
+ y_true : C-contiguous array of shape (n_samples,)
173
+ Observed, true target values.
174
+ raw_prediction : C-contiguous array of shape (n_samples,) or array of \
175
+ shape (n_samples, n_classes)
176
+ Raw prediction values (in link space).
177
+ sample_weight : None or C-contiguous array of shape (n_samples,)
178
+ Sample weights.
179
+ loss_out : None or C-contiguous array of shape (n_samples,)
180
+ A location into which the result is stored. If None, a new array
181
+ might be created.
182
+ n_threads : int, default=1
183
+ Might use openmp thread parallelism.
184
+
185
+ Returns
186
+ -------
187
+ loss : array of shape (n_samples,)
188
+ Element-wise loss function.
189
+ """
190
+ if loss_out is None:
191
+ loss_out = np.empty_like(y_true)
192
+ # Be graceful to shape (n_samples, 1) -> (n_samples,)
193
+ if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1:
194
+ raw_prediction = raw_prediction.squeeze(1)
195
+
196
+ self.closs.loss(
197
+ y_true=y_true,
198
+ raw_prediction=raw_prediction,
199
+ sample_weight=sample_weight,
200
+ loss_out=loss_out,
201
+ n_threads=n_threads,
202
+ )
203
+ return loss_out
204
+
205
+ def loss_gradient(
206
+ self,
207
+ y_true,
208
+ raw_prediction,
209
+ sample_weight=None,
210
+ loss_out=None,
211
+ gradient_out=None,
212
+ n_threads=1,
213
+ ):
214
+ """Compute loss and gradient w.r.t. raw_prediction for each input.
215
+
216
+ Parameters
217
+ ----------
218
+ y_true : C-contiguous array of shape (n_samples,)
219
+ Observed, true target values.
220
+ raw_prediction : C-contiguous array of shape (n_samples,) or array of \
221
+ shape (n_samples, n_classes)
222
+ Raw prediction values (in link space).
223
+ sample_weight : None or C-contiguous array of shape (n_samples,)
224
+ Sample weights.
225
+ loss_out : None or C-contiguous array of shape (n_samples,)
226
+ A location into which the loss is stored. If None, a new array
227
+ might be created.
228
+ gradient_out : None or C-contiguous array of shape (n_samples,) or array \
229
+ of shape (n_samples, n_classes)
230
+ A location into which the gradient is stored. If None, a new array
231
+ might be created.
232
+ n_threads : int, default=1
233
+ Might use openmp thread parallelism.
234
+
235
+ Returns
236
+ -------
237
+ loss : array of shape (n_samples,)
238
+ Element-wise loss function.
239
+
240
+ gradient : array of shape (n_samples,) or (n_samples, n_classes)
241
+ Element-wise gradients.
242
+ """
243
+ if loss_out is None:
244
+ if gradient_out is None:
245
+ loss_out = np.empty_like(y_true)
246
+ gradient_out = np.empty_like(raw_prediction)
247
+ else:
248
+ loss_out = np.empty_like(y_true, dtype=gradient_out.dtype)
249
+ elif gradient_out is None:
250
+ gradient_out = np.empty_like(raw_prediction, dtype=loss_out.dtype)
251
+
252
+ # Be graceful to shape (n_samples, 1) -> (n_samples,)
253
+ if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1:
254
+ raw_prediction = raw_prediction.squeeze(1)
255
+ if gradient_out.ndim == 2 and gradient_out.shape[1] == 1:
256
+ gradient_out = gradient_out.squeeze(1)
257
+
258
+ self.closs.loss_gradient(
259
+ y_true=y_true,
260
+ raw_prediction=raw_prediction,
261
+ sample_weight=sample_weight,
262
+ loss_out=loss_out,
263
+ gradient_out=gradient_out,
264
+ n_threads=n_threads,
265
+ )
266
+ return loss_out, gradient_out
267
+
268
+ def gradient(
269
+ self,
270
+ y_true,
271
+ raw_prediction,
272
+ sample_weight=None,
273
+ gradient_out=None,
274
+ n_threads=1,
275
+ ):
276
+ """Compute gradient of loss w.r.t raw_prediction for each input.
277
+
278
+ Parameters
279
+ ----------
280
+ y_true : C-contiguous array of shape (n_samples,)
281
+ Observed, true target values.
282
+ raw_prediction : C-contiguous array of shape (n_samples,) or array of \
283
+ shape (n_samples, n_classes)
284
+ Raw prediction values (in link space).
285
+ sample_weight : None or C-contiguous array of shape (n_samples,)
286
+ Sample weights.
287
+ gradient_out : None or C-contiguous array of shape (n_samples,) or array \
288
+ of shape (n_samples, n_classes)
289
+ A location into which the result is stored. If None, a new array
290
+ might be created.
291
+ n_threads : int, default=1
292
+ Might use openmp thread parallelism.
293
+
294
+ Returns
295
+ -------
296
+ gradient : array of shape (n_samples,) or (n_samples, n_classes)
297
+ Element-wise gradients.
298
+ """
299
+ if gradient_out is None:
300
+ gradient_out = np.empty_like(raw_prediction)
301
+
302
+ # Be graceful to shape (n_samples, 1) -> (n_samples,)
303
+ if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1:
304
+ raw_prediction = raw_prediction.squeeze(1)
305
+ if gradient_out.ndim == 2 and gradient_out.shape[1] == 1:
306
+ gradient_out = gradient_out.squeeze(1)
307
+
308
+ self.closs.gradient(
309
+ y_true=y_true,
310
+ raw_prediction=raw_prediction,
311
+ sample_weight=sample_weight,
312
+ gradient_out=gradient_out,
313
+ n_threads=n_threads,
314
+ )
315
+ return gradient_out
316
+
317
+ def gradient_hessian(
318
+ self,
319
+ y_true,
320
+ raw_prediction,
321
+ sample_weight=None,
322
+ gradient_out=None,
323
+ hessian_out=None,
324
+ n_threads=1,
325
+ ):
326
+ """Compute gradient and hessian of loss w.r.t raw_prediction.
327
+
328
+ Parameters
329
+ ----------
330
+ y_true : C-contiguous array of shape (n_samples,)
331
+ Observed, true target values.
332
+ raw_prediction : C-contiguous array of shape (n_samples,) or array of \
333
+ shape (n_samples, n_classes)
334
+ Raw prediction values (in link space).
335
+ sample_weight : None or C-contiguous array of shape (n_samples,)
336
+ Sample weights.
337
+ gradient_out : None or C-contiguous array of shape (n_samples,) or array \
338
+ of shape (n_samples, n_classes)
339
+ A location into which the gradient is stored. If None, a new array
340
+ might be created.
341
+ hessian_out : None or C-contiguous array of shape (n_samples,) or array \
342
+ of shape (n_samples, n_classes)
343
+ A location into which the hessian is stored. If None, a new array
344
+ might be created.
345
+ n_threads : int, default=1
346
+ Might use openmp thread parallelism.
347
+
348
+ Returns
349
+ -------
350
+ gradient : arrays of shape (n_samples,) or (n_samples, n_classes)
351
+ Element-wise gradients.
352
+
353
+ hessian : arrays of shape (n_samples,) or (n_samples, n_classes)
354
+ Element-wise hessians.
355
+ """
356
+ if gradient_out is None:
357
+ if hessian_out is None:
358
+ gradient_out = np.empty_like(raw_prediction)
359
+ hessian_out = np.empty_like(raw_prediction)
360
+ else:
361
+ gradient_out = np.empty_like(hessian_out)
362
+ elif hessian_out is None:
363
+ hessian_out = np.empty_like(gradient_out)
364
+
365
+ # Be graceful to shape (n_samples, 1) -> (n_samples,)
366
+ if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1:
367
+ raw_prediction = raw_prediction.squeeze(1)
368
+ if gradient_out.ndim == 2 and gradient_out.shape[1] == 1:
369
+ gradient_out = gradient_out.squeeze(1)
370
+ if hessian_out.ndim == 2 and hessian_out.shape[1] == 1:
371
+ hessian_out = hessian_out.squeeze(1)
372
+
373
+ self.closs.gradient_hessian(
374
+ y_true=y_true,
375
+ raw_prediction=raw_prediction,
376
+ sample_weight=sample_weight,
377
+ gradient_out=gradient_out,
378
+ hessian_out=hessian_out,
379
+ n_threads=n_threads,
380
+ )
381
+ return gradient_out, hessian_out
382
+
383
+ def __call__(self, y_true, raw_prediction, sample_weight=None, n_threads=1):
384
+ """Compute the weighted average loss.
385
+
386
+ Parameters
387
+ ----------
388
+ y_true : C-contiguous array of shape (n_samples,)
389
+ Observed, true target values.
390
+ raw_prediction : C-contiguous array of shape (n_samples,) or array of \
391
+ shape (n_samples, n_classes)
392
+ Raw prediction values (in link space).
393
+ sample_weight : None or C-contiguous array of shape (n_samples,)
394
+ Sample weights.
395
+ n_threads : int, default=1
396
+ Might use openmp thread parallelism.
397
+
398
+ Returns
399
+ -------
400
+ loss : float
401
+ Mean or averaged loss function.
402
+ """
403
+ return np.average(
404
+ self.loss(
405
+ y_true=y_true,
406
+ raw_prediction=raw_prediction,
407
+ sample_weight=None,
408
+ loss_out=None,
409
+ n_threads=n_threads,
410
+ ),
411
+ weights=sample_weight,
412
+ )
413
+
414
+ def fit_intercept_only(self, y_true, sample_weight=None):
415
+ """Compute raw_prediction of an intercept-only model.
416
+
417
+ This can be used as initial estimates of predictions, i.e. before the
418
+ first iteration in fit.
419
+
420
+ Parameters
421
+ ----------
422
+ y_true : array-like of shape (n_samples,)
423
+ Observed, true target values.
424
+ sample_weight : None or array of shape (n_samples,)
425
+ Sample weights.
426
+
427
+ Returns
428
+ -------
429
+ raw_prediction : numpy scalar or array of shape (n_classes,)
430
+ Raw predictions of an intercept-only model.
431
+ """
432
+ # As default, take weighted average of the target over the samples
433
+ # axis=0 and then transform into link-scale (raw_prediction).
434
+ y_pred = np.average(y_true, weights=sample_weight, axis=0)
435
+ eps = 10 * np.finfo(y_pred.dtype).eps
436
+
437
+ if self.interval_y_pred.low == -np.inf:
438
+ a_min = None
439
+ elif self.interval_y_pred.low_inclusive:
440
+ a_min = self.interval_y_pred.low
441
+ else:
442
+ a_min = self.interval_y_pred.low + eps
443
+
444
+ if self.interval_y_pred.high == np.inf:
445
+ a_max = None
446
+ elif self.interval_y_pred.high_inclusive:
447
+ a_max = self.interval_y_pred.high
448
+ else:
449
+ a_max = self.interval_y_pred.high - eps
450
+
451
+ if a_min is None and a_max is None:
452
+ return self.link.link(y_pred)
453
+ else:
454
+ return self.link.link(np.clip(y_pred, a_min, a_max))
455
+
456
+ def constant_to_optimal_zero(self, y_true, sample_weight=None):
457
+ """Calculate term dropped in loss.
458
+
459
+ With this term added, the loss of perfect predictions is zero.
460
+ """
461
+ return np.zeros_like(y_true)
462
+
463
+ def init_gradient_and_hessian(self, n_samples, dtype=np.float64, order="F"):
464
+ """Initialize arrays for gradients and hessians.
465
+
466
+ Unless hessians are constant, arrays are initialized with undefined values.
467
+
468
+ Parameters
469
+ ----------
470
+ n_samples : int
471
+ The number of samples, usually passed to `fit()`.
472
+ dtype : {np.float64, np.float32}, default=np.float64
473
+ The dtype of the arrays gradient and hessian.
474
+ order : {'C', 'F'}, default='F'
475
+ Order of the arrays gradient and hessian. The default 'F' makes the arrays
476
+ contiguous along samples.
477
+
478
+ Returns
479
+ -------
480
+ gradient : C-contiguous array of shape (n_samples,) or array of shape \
481
+ (n_samples, n_classes)
482
+ Empty array (allocated but not initialized) to be used as argument
483
+ gradient_out.
484
+ hessian : C-contiguous array of shape (n_samples,), array of shape
485
+ (n_samples, n_classes) or shape (1,)
486
+ Empty (allocated but not initialized) array to be used as argument
487
+ hessian_out.
488
+ If constant_hessian is True (e.g. `HalfSquaredError`), the array is
489
+ initialized to ``1``.
490
+ """
491
+ if dtype not in (np.float32, np.float64):
492
+ raise ValueError(
493
+ "Valid options for 'dtype' are np.float32 and np.float64. "
494
+ f"Got dtype={dtype} instead."
495
+ )
496
+
497
+ if self.is_multiclass:
498
+ shape = (n_samples, self.n_classes)
499
+ else:
500
+ shape = (n_samples,)
501
+ gradient = np.empty(shape=shape, dtype=dtype, order=order)
502
+
503
+ if self.constant_hessian:
504
+ # If the hessians are constant, we consider them equal to 1.
505
+ # - This is correct for HalfSquaredError
506
+ # - For AbsoluteError, hessians are actually 0, but they are
507
+ # always ignored anyway.
508
+ hessian = np.ones(shape=(1,), dtype=dtype)
509
+ else:
510
+ hessian = np.empty(shape=shape, dtype=dtype, order=order)
511
+
512
+ return gradient, hessian
513
+
514
+
515
+ # Note: Naturally, we would inherit in the following order
516
+ # class HalfSquaredError(IdentityLink, CyHalfSquaredError, BaseLoss)
517
+ # But because of https://github.com/cython/cython/issues/4350 we
518
+ # set BaseLoss as the last one. This, of course, changes the MRO.
519
+ class HalfSquaredError(BaseLoss):
520
+ """Half squared error with identity link, for regression.
521
+
522
+ Domain:
523
+ y_true and y_pred all real numbers
524
+
525
+ Link:
526
+ y_pred = raw_prediction
527
+
528
+ For a given sample x_i, half squared error is defined as::
529
+
530
+ loss(x_i) = 0.5 * (y_true_i - raw_prediction_i)**2
531
+
532
+ The factor of 0.5 simplifies the computation of gradients and results in a
533
+ unit hessian (and is consistent with what is done in LightGBM). It is also
534
+ half the Normal distribution deviance.
535
+ """
536
+
537
+ def __init__(self, sample_weight=None):
538
+ super().__init__(closs=CyHalfSquaredError(), link=IdentityLink())
539
+ self.constant_hessian = sample_weight is None
540
+
541
+
542
+ class AbsoluteError(BaseLoss):
543
+ """Absolute error with identity link, for regression.
544
+
545
+ Domain:
546
+ y_true and y_pred all real numbers
547
+
548
+ Link:
549
+ y_pred = raw_prediction
550
+
551
+ For a given sample x_i, the absolute error is defined as::
552
+
553
+ loss(x_i) = |y_true_i - raw_prediction_i|
554
+
555
+ Note that the exact hessian = 0 almost everywhere (except at one point, therefore
556
+ differentiable = False). Optimization routines like in HGBT, however, need a
557
+ hessian > 0. Therefore, we assign 1.
558
+ """
559
+
560
+ differentiable = False
561
+ need_update_leaves_values = True
562
+
563
+ def __init__(self, sample_weight=None):
564
+ super().__init__(closs=CyAbsoluteError(), link=IdentityLink())
565
+ self.approx_hessian = True
566
+ self.constant_hessian = sample_weight is None
567
+
568
+ def fit_intercept_only(self, y_true, sample_weight=None):
569
+ """Compute raw_prediction of an intercept-only model.
570
+
571
+ This is the weighted median of the target, i.e. over the samples
572
+ axis=0.
573
+ """
574
+ if sample_weight is None:
575
+ return np.median(y_true, axis=0)
576
+ else:
577
+ return _weighted_percentile(y_true, sample_weight, 50)
578
+
579
+
580
+ class PinballLoss(BaseLoss):
581
+ """Quantile loss aka pinball loss, for regression.
582
+
583
+ Domain:
584
+ y_true and y_pred all real numbers
585
+ quantile in (0, 1)
586
+
587
+ Link:
588
+ y_pred = raw_prediction
589
+
590
+ For a given sample x_i, the pinball loss is defined as::
591
+
592
+ loss(x_i) = rho_{quantile}(y_true_i - raw_prediction_i)
593
+
594
+ rho_{quantile}(u) = u * (quantile - 1_{u<0})
595
+ = -u *(1 - quantile) if u < 0
596
+ u * quantile if u >= 0
597
+
598
+ Note: 2 * PinballLoss(quantile=0.5) equals AbsoluteError().
599
+
600
+ Note that the exact hessian = 0 almost everywhere (except at one point, therefore
601
+ differentiable = False). Optimization routines like in HGBT, however, need a
602
+ hessian > 0. Therefore, we assign 1.
603
+
604
+ Additional Attributes
605
+ ---------------------
606
+ quantile : float
607
+ The quantile level of the quantile to be estimated. Must be in range (0, 1).
608
+ """
609
+
610
+ differentiable = False
611
+ need_update_leaves_values = True
612
+
613
+ def __init__(self, sample_weight=None, quantile=0.5):
614
+ check_scalar(
615
+ quantile,
616
+ "quantile",
617
+ target_type=numbers.Real,
618
+ min_val=0,
619
+ max_val=1,
620
+ include_boundaries="neither",
621
+ )
622
+ super().__init__(
623
+ closs=CyPinballLoss(quantile=float(quantile)),
624
+ link=IdentityLink(),
625
+ )
626
+ self.approx_hessian = True
627
+ self.constant_hessian = sample_weight is None
628
+
629
+ def fit_intercept_only(self, y_true, sample_weight=None):
630
+ """Compute raw_prediction of an intercept-only model.
631
+
632
+ This is the weighted median of the target, i.e. over the samples
633
+ axis=0.
634
+ """
635
+ if sample_weight is None:
636
+ return np.percentile(y_true, 100 * self.closs.quantile, axis=0)
637
+ else:
638
+ return _weighted_percentile(
639
+ y_true, sample_weight, 100 * self.closs.quantile
640
+ )
641
+
642
+
643
+ class HuberLoss(BaseLoss):
644
+ """Huber loss, for regression.
645
+
646
+ Domain:
647
+ y_true and y_pred all real numbers
648
+ quantile in (0, 1)
649
+
650
+ Link:
651
+ y_pred = raw_prediction
652
+
653
+ For a given sample x_i, the Huber loss is defined as::
654
+
655
+ loss(x_i) = 1/2 * abserr**2 if abserr <= delta
656
+ delta * (abserr - delta/2) if abserr > delta
657
+
658
+ abserr = |y_true_i - raw_prediction_i|
659
+ delta = quantile(abserr, self.quantile)
660
+
661
+ Note: HuberLoss(quantile=1) equals HalfSquaredError and HuberLoss(quantile=0)
662
+ equals delta * (AbsoluteError() - delta/2).
663
+
664
+ Additional Attributes
665
+ ---------------------
666
+ quantile : float
667
+ The quantile level which defines the breaking point `delta` to distinguish
668
+ between absolute error and squared error. Must be in range (0, 1).
669
+
670
+ Reference
671
+ ---------
672
+ .. [1] Friedman, J.H. (2001). :doi:`Greedy function approximation: A gradient
673
+ boosting machine <10.1214/aos/1013203451>`.
674
+ Annals of Statistics, 29, 1189-1232.
675
+ """
676
+
677
+ differentiable = False
678
+ need_update_leaves_values = True
679
+
680
+ def __init__(self, sample_weight=None, quantile=0.9, delta=0.5):
681
+ check_scalar(
682
+ quantile,
683
+ "quantile",
684
+ target_type=numbers.Real,
685
+ min_val=0,
686
+ max_val=1,
687
+ include_boundaries="neither",
688
+ )
689
+ self.quantile = quantile # This is better stored outside of Cython.
690
+ super().__init__(
691
+ closs=CyHuberLoss(delta=float(delta)),
692
+ link=IdentityLink(),
693
+ )
694
+ self.approx_hessian = True
695
+ self.constant_hessian = False
696
+
697
+ def fit_intercept_only(self, y_true, sample_weight=None):
698
+ """Compute raw_prediction of an intercept-only model.
699
+
700
+ This is the weighted median of the target, i.e. over the samples
701
+ axis=0.
702
+ """
703
+ # See formula before algo 4 in Friedman (2001), but we apply it to y_true,
704
+ # not to the residual y_true - raw_prediction. An estimator like
705
+ # HistGradientBoostingRegressor might then call it on the residual, e.g.
706
+ # fit_intercept_only(y_true - raw_prediction).
707
+ if sample_weight is None:
708
+ median = np.percentile(y_true, 50, axis=0)
709
+ else:
710
+ median = _weighted_percentile(y_true, sample_weight, 50)
711
+ diff = y_true - median
712
+ term = np.sign(diff) * np.minimum(self.closs.delta, np.abs(diff))
713
+ return median + np.average(term, weights=sample_weight)
714
+
715
+
716
+ class HalfPoissonLoss(BaseLoss):
717
+ """Half Poisson deviance loss with log-link, for regression.
718
+
719
+ Domain:
720
+ y_true in non-negative real numbers
721
+ y_pred in positive real numbers
722
+
723
+ Link:
724
+ y_pred = exp(raw_prediction)
725
+
726
+ For a given sample x_i, half the Poisson deviance is defined as::
727
+
728
+ loss(x_i) = y_true_i * log(y_true_i/exp(raw_prediction_i))
729
+ - y_true_i + exp(raw_prediction_i)
730
+
731
+ Half the Poisson deviance is actually the negative log-likelihood up to
732
+ constant terms (not involving raw_prediction) and simplifies the
733
+ computation of the gradients.
734
+ We also skip the constant term `y_true_i * log(y_true_i) - y_true_i`.
735
+ """
736
+
737
+ def __init__(self, sample_weight=None):
738
+ super().__init__(closs=CyHalfPoissonLoss(), link=LogLink())
739
+ self.interval_y_true = Interval(0, np.inf, True, False)
740
+
741
+ def constant_to_optimal_zero(self, y_true, sample_weight=None):
742
+ term = xlogy(y_true, y_true) - y_true
743
+ if sample_weight is not None:
744
+ term *= sample_weight
745
+ return term
746
+
747
+
748
+ class HalfGammaLoss(BaseLoss):
749
+ """Half Gamma deviance loss with log-link, for regression.
750
+
751
+ Domain:
752
+ y_true and y_pred in positive real numbers
753
+
754
+ Link:
755
+ y_pred = exp(raw_prediction)
756
+
757
+ For a given sample x_i, half Gamma deviance loss is defined as::
758
+
759
+ loss(x_i) = log(exp(raw_prediction_i)/y_true_i)
760
+ + y_true/exp(raw_prediction_i) - 1
761
+
762
+ Half the Gamma deviance is actually proportional to the negative log-
763
+ likelihood up to constant terms (not involving raw_prediction) and
764
+ simplifies the computation of the gradients.
765
+ We also skip the constant term `-log(y_true_i) - 1`.
766
+ """
767
+
768
+ def __init__(self, sample_weight=None):
769
+ super().__init__(closs=CyHalfGammaLoss(), link=LogLink())
770
+ self.interval_y_true = Interval(0, np.inf, False, False)
771
+
772
+ def constant_to_optimal_zero(self, y_true, sample_weight=None):
773
+ term = -np.log(y_true) - 1
774
+ if sample_weight is not None:
775
+ term *= sample_weight
776
+ return term
777
+
778
+
779
+ class HalfTweedieLoss(BaseLoss):
780
+ """Half Tweedie deviance loss with log-link, for regression.
781
+
782
+ Domain:
783
+ y_true in real numbers for power <= 0
784
+ y_true in non-negative real numbers for 0 < power < 2
785
+ y_true in positive real numbers for 2 <= power
786
+ y_pred in positive real numbers
787
+ power in real numbers
788
+
789
+ Link:
790
+ y_pred = exp(raw_prediction)
791
+
792
+ For a given sample x_i, half Tweedie deviance loss with p=power is defined
793
+ as::
794
+
795
+ loss(x_i) = max(y_true_i, 0)**(2-p) / (1-p) / (2-p)
796
+ - y_true_i * exp(raw_prediction_i)**(1-p) / (1-p)
797
+ + exp(raw_prediction_i)**(2-p) / (2-p)
798
+
799
+ Taking the limits for p=0, 1, 2 gives HalfSquaredError with a log link,
800
+ HalfPoissonLoss and HalfGammaLoss.
801
+
802
+ We also skip constant terms, but those are different for p=0, 1, 2.
803
+ Therefore, the loss is not continuous in `power`.
804
+
805
+ Note furthermore that although no Tweedie distribution exists for
806
+ 0 < power < 1, it still gives a strictly consistent scoring function for
807
+ the expectation.
808
+ """
809
+
810
+ def __init__(self, sample_weight=None, power=1.5):
811
+ super().__init__(
812
+ closs=CyHalfTweedieLoss(power=float(power)),
813
+ link=LogLink(),
814
+ )
815
+ if self.closs.power <= 0:
816
+ self.interval_y_true = Interval(-np.inf, np.inf, False, False)
817
+ elif self.closs.power < 2:
818
+ self.interval_y_true = Interval(0, np.inf, True, False)
819
+ else:
820
+ self.interval_y_true = Interval(0, np.inf, False, False)
821
+
822
+ def constant_to_optimal_zero(self, y_true, sample_weight=None):
823
+ if self.closs.power == 0:
824
+ return HalfSquaredError().constant_to_optimal_zero(
825
+ y_true=y_true, sample_weight=sample_weight
826
+ )
827
+ elif self.closs.power == 1:
828
+ return HalfPoissonLoss().constant_to_optimal_zero(
829
+ y_true=y_true, sample_weight=sample_weight
830
+ )
831
+ elif self.closs.power == 2:
832
+ return HalfGammaLoss().constant_to_optimal_zero(
833
+ y_true=y_true, sample_weight=sample_weight
834
+ )
835
+ else:
836
+ p = self.closs.power
837
+ term = np.power(np.maximum(y_true, 0), 2 - p) / (1 - p) / (2 - p)
838
+ if sample_weight is not None:
839
+ term *= sample_weight
840
+ return term
841
+
842
+
843
class HalfTweedieLossIdentity(BaseLoss):
    """Half Tweedie deviance loss with identity link, for regression.

    Domain:
    y_true in real numbers for power <= 0
    y_true in non-negative real numbers for 0 < power < 2
    y_true in positive real numbers for 2 <= power
    y_pred in positive real numbers for power != 0
    y_pred in real numbers for power = 0
    power in real numbers

    Link:
    y_pred = raw_prediction

    For a given sample x_i, half the Tweedie deviance with p=power reads::

        loss(x_i) = max(y_true_i, 0)**(2-p) / (1-p) / (2-p)
                    - y_true_i * raw_prediction_i**(1-p) / (1-p)
                    + raw_prediction_i**(2-p) / (2-p)

    The minimum value of this loss is 0.

    Although no Tweedie distribution exists for 0 < power < 1, the loss is
    still a strictly consistent scoring function for the expectation.
    """

    def __init__(self, sample_weight=None, power=1.5):
        super().__init__(
            closs=CyHalfTweedieLossIdentity(power=float(power)),
            link=IdentityLink(),
        )
        p = self.closs.power
        # Valid y_true range depends on the Tweedie power parameter.
        if p <= 0:
            self.interval_y_true = Interval(-np.inf, np.inf, False, False)
        elif p < 2:
            self.interval_y_true = Interval(0, np.inf, True, False)
        else:
            self.interval_y_true = Interval(0, np.inf, False, False)

        # With the identity link, predictions must be strictly positive
        # except for power == 0 (squared error), where any real is allowed.
        if p == 0:
            self.interval_y_pred = Interval(-np.inf, np.inf, False, False)
        else:
            self.interval_y_pred = Interval(0, np.inf, False, False)
887
+
888
+
889
class HalfBinomialLoss(BaseLoss):
    """Half Binomial deviance loss with logit link, for binary classification.

    This is also known as binary cross entropy, log-loss and logistic loss.

    Domain:
    y_true in [0, 1], i.e. regression on the unit interval
    y_pred in (0, 1), i.e. boundaries excluded

    Link:
    y_pred = expit(raw_prediction)

    For a given sample x_i, half the Binomial deviance is the negative
    log-likelihood of the Binomial/Bernoulli distribution::

        loss(x_i) = log(1 + exp(raw_pred_i)) - y_true_i * raw_pred_i

    See The Elements of Statistical Learning, by Hastie, Tibshirani, Friedman,
    section 4.4.1 (about logistic regression).

    The formulation covers classification, y = {0, 1}, as well as logistic
    regression on y = [0, 1]. Adding `constant_to_optimal_zero` to the loss
    yields half the Bernoulli/binomial deviance.

    Substituting the predicted probability y_pred = expit(raw_prediction)
    recovers the familiar form::

        loss(x_i) = - y_true_i * log(y_pred_i) - (1 - y_true_i) * log(1 - y_pred_i)
    """

    def __init__(self, sample_weight=None):
        super().__init__(
            closs=CyHalfBinomialLoss(),
            link=LogitLink(),
            n_classes=2,
        )
        self.interval_y_true = Interval(0, 1, True, True)

    def constant_to_optimal_zero(self, y_true, sample_weight=None):
        # Entropy term; it vanishes for hard labels y_true in {0, 1}.
        term = xlogy(y_true, y_true) + xlogy(1 - y_true, 1 - y_true)
        return term if sample_weight is None else term * sample_weight

    def predict_proba(self, raw_prediction):
        """Predict probabilities.

        Parameters
        ----------
        raw_prediction : array of shape (n_samples,) or (n_samples, 1)
            Raw prediction values (in link space).

        Returns
        -------
        proba : array of shape (n_samples, 2)
            Element-wise class probabilities.
        """
        # Accept both (n_samples,) and (n_samples, 1) shaped inputs.
        if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1:
            raw_prediction = raw_prediction.squeeze(1)
        p_pos = self.link.inverse(raw_prediction)
        proba = np.empty((raw_prediction.shape[0], 2), dtype=raw_prediction.dtype)
        proba[:, 1] = p_pos
        proba[:, 0] = 1 - p_pos
        return proba
956
+
957
+
958
class HalfMultinomialLoss(BaseLoss):
    """Categorical cross-entropy loss, for multiclass classification.

    Domain:
    y_true in {0, 1, 2, 3, .., n_classes - 1}
    y_pred has n_classes elements, each element in (0, 1)

    Link:
    y_pred = softmax(raw_prediction)

    Note: y_true is assumed to be label encoded already. The inverse link is
    the softmax; the full link function is the symmetric multinomial logit.

    For a given sample x_i, the categorical cross-entropy is the negative
    log-likelihood of the multinomial distribution and generalizes binary
    cross-entropy to more than 2 classes::

        loss_i = log(sum(exp(raw_pred_{i, k}), k=0..n_classes-1))
                - sum(y_true_{i, k} * raw_pred_{i, k}, k=0..n_classes-1)

    See [1].

    For the hessian, only the diagonal part in the classes is computed: if
    the full hessian for classes k and l and sample i is H_i_k_l, we compute
    H_i_k_k, i.e. k=l.

    Reference
    ---------
    .. [1] :arxiv:`Simon, Noah, J. Friedman and T. Hastie.
        "A Blockwise Descent Algorithm for Group-penalized Multiresponse and
        Multinomial Regression".
        <1311.6529>`
    """

    is_multiclass = True

    def __init__(self, sample_weight=None, n_classes=3):
        super().__init__(
            closs=CyHalfMultinomialLoss(),
            link=MultinomialLogit(),
            n_classes=n_classes,
        )
        self.interval_y_true = Interval(0, np.inf, True, False)
        self.interval_y_pred = Interval(0, 1, False, False)

    def in_y_true_range(self, y):
        """Return True if y is in the valid range of y_true.

        Parameters
        ----------
        y : ndarray
        """
        # Values must be non-negative finite numbers AND integral.
        return self.interval_y_true.includes(y) and np.all(y.astype(int) == y)

    def fit_intercept_only(self, y_true, sample_weight=None):
        """Compute raw_prediction of an intercept-only model.

        This is the (multinomial logit) link of the weighted class
        frequencies, i.e. an average over the samples axis=0.
        """
        eps = np.finfo(y_true.dtype).eps
        freq = np.zeros(self.n_classes, dtype=y_true.dtype)
        for k in range(self.n_classes):
            # Weighted class frequency, clipped away from 0 and 1 so the
            # link stays finite.
            avg = np.average(y_true == k, weights=sample_weight, axis=0)
            freq[k] = np.clip(avg, eps, 1 - eps)
        return self.link.link(freq[None, :]).reshape(-1)

    def predict_proba(self, raw_prediction):
        """Predict probabilities.

        Parameters
        ----------
        raw_prediction : array of shape (n_samples, n_classes)
            Raw prediction values (in link space).

        Returns
        -------
        proba : array of shape (n_samples, n_classes)
            Element-wise class probabilities.
        """
        return self.link.inverse(raw_prediction)

    def gradient_proba(
        self,
        y_true,
        raw_prediction,
        sample_weight=None,
        gradient_out=None,
        proba_out=None,
        n_threads=1,
    ):
        """Compute gradient and class probabilities for raw_prediction.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : array of shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        gradient_out : None or array of shape (n_samples, n_classes)
            A location into which the gradient is stored. If None, a new array
            might be created.
        proba_out : None or array of shape (n_samples, n_classes)
            A location into which the class probabilities are stored. If None,
            a new array might be created.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        gradient : array of shape (n_samples, n_classes)
            Element-wise gradients.

        proba : array of shape (n_samples, n_classes)
            Element-wise class probabilities.
        """
        # Allocate any missing output buffer, matching the shape/dtype of an
        # already provided buffer when possible, otherwise raw_prediction.
        if gradient_out is None:
            template = raw_prediction if proba_out is None else proba_out
            gradient_out = np.empty_like(template)
        if proba_out is None:
            proba_out = np.empty_like(gradient_out)

        self.closs.gradient_proba(
            y_true=y_true,
            raw_prediction=raw_prediction,
            sample_weight=sample_weight,
            gradient_out=gradient_out,
            proba_out=proba_out,
            n_threads=n_threads,
        )
        return gradient_out, proba_out
1095
+
1096
+
1097
class ExponentialLoss(BaseLoss):
    """Exponential loss with (half) logit link, for binary classification.

    This is also known as boosting loss.

    Domain:
    y_true in [0, 1], i.e. regression on the unit interval
    y_pred in (0, 1), i.e. boundaries excluded

    Link:
    y_pred = expit(2 * raw_prediction)

    For a given sample x_i, the exponential loss is defined as::

        loss(x_i) = y_true_i * exp(-raw_pred_i) + (1 - y_true_i) * exp(raw_pred_i)

    See:
    - J. Friedman, T. Hastie, R. Tibshirani.
      "Additive logistic regression: a statistical view of boosting (With discussion
      and a rejoinder by the authors)." Ann. Statist. 28 (2) 337 - 407, April 2000.
      https://doi.org/10.1214/aos/1016218223
    - A. Buja, W. Stuetzle, Y. Shen. (2005).
      "Loss Functions for Binary Class Probability Estimation and Classification:
      Structure and Applications."

    The formulation covers classification, y = {0, 1}, as well as
    "exponential logistic" regression on y = [0, 1].
    This is a proper scoring rule, but without its canonical link.

    Substituting the predicted probability y_pred = expit(2 * raw_prediction)
    into the loss gives::

        loss(x_i) = y_true_i * sqrt((1 - y_pred_i) / y_pred_i)
                    + (1 - y_true_i) * sqrt(y_pred_i / (1 - y_pred_i))
    """

    def __init__(self, sample_weight=None):
        super().__init__(
            closs=CyExponentialLoss(),
            link=HalfLogitLink(),
            n_classes=2,
        )
        self.interval_y_true = Interval(0, 1, True, True)

    def constant_to_optimal_zero(self, y_true, sample_weight=None):
        # Non-zero only for soft labels, i.e. y_true strictly inside (0, 1).
        term = -2 * np.sqrt(y_true * (1 - y_true))
        return term if sample_weight is None else term * sample_weight

    def predict_proba(self, raw_prediction):
        """Predict probabilities.

        Parameters
        ----------
        raw_prediction : array of shape (n_samples,) or (n_samples, 1)
            Raw prediction values (in link space).

        Returns
        -------
        proba : array of shape (n_samples, 2)
            Element-wise class probabilities.
        """
        # Accept both (n_samples,) and (n_samples, 1) shaped inputs.
        if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1:
            raw_prediction = raw_prediction.squeeze(1)
        p_pos = self.link.inverse(raw_prediction)
        proba = np.empty((raw_prediction.shape[0], 2), dtype=raw_prediction.dtype)
        proba[:, 1] = p_pos
        proba[:, 0] = 1 - p_pos
        return proba
1168
+
1169
+
1170
# Mapping from public loss-name string to the corresponding loss class.
# Estimators use this to instantiate a loss from a string parameter.
_LOSSES = {
    "squared_error": HalfSquaredError,
    "absolute_error": AbsoluteError,
    "pinball_loss": PinballLoss,
    "huber_loss": HuberLoss,
    "poisson_loss": HalfPoissonLoss,
    "gamma_loss": HalfGammaLoss,
    "tweedie_loss": HalfTweedieLoss,
    "binomial_loss": HalfBinomialLoss,
    "multinomial_loss": HalfMultinomialLoss,
    "exponential_loss": ExponentialLoss,
}
.venv/lib/python3.12/site-packages/sklearn/_loss/meson.build ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# _loss.pyx is generated, so this copy step is needed to make Cython
# compilation work (the .pxd must sit next to the generated .pyx).
_loss_cython_tree = [
  fs.copyfile('_loss.pxd')
]

# Generate _loss.pyx from the Tempita template _loss.pyx.tp.
_loss_pyx = custom_target(
  '_loss_pyx',
  output: '_loss.pyx',
  input: '_loss.pyx.tp',
  command: [tempita, '@INPUT@', '-o', '@OUTDIR@'],
  # TODO in principle this should go in py.extension_module below. This is a
  # temporary work-around for a dependency issue with .pyx.tp files. For more
  # details, see https://github.com/mesonbuild/meson/issues/13212
  depends: _loss_cython_tree,
)

py.extension_module(
  '_loss',
  cython_gen.process(_loss_pyx),
  dependencies: [openmp_dep],
  install: true,
  subdir: 'sklearn/_loss',
)