diff --git a/.gitattributes b/.gitattributes index 1943cbaac98cbd9242194c83ee511d0b167494a7..4cd5cc6d7d9eab55bf0fdc4ee3d60c3da4d53cbc 100644 --- a/.gitattributes +++ b/.gitattributes @@ -264,3 +264,7 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/_ .venv/lib/python3.11/site-packages/numpy.libs/libquadmath-96973f99.so.0.0.0 filter=lfs diff=lfs merge=lfs -text .venv/lib/python3.11/site-packages/numpy.libs/libgfortran-040039e1.so.5.0.0 filter=lfs diff=lfs merge=lfs -text tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/stats/__pycache__/stochastic_process_types.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text +tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/lib/libcurand.so.10 filter=lfs diff=lfs merge=lfs -text +tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/libcudnn_ops_infer.so.8 filter=lfs diff=lfs merge=lfs -text +tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/lib/libcublas.so.11 filter=lfs diff=lfs merge=lfs -text +tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/libcudnn_cnn_train.so.8 filter=lfs diff=lfs merge=lfs -text diff --git a/llm_tutorial/merged_models/en-ja-base_0.33+instruct/special_tokens_map.json b/llm_tutorial/merged_models/en-ja-base_0.33+instruct/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..51d5c15b521aa1999f68b840245a10ca9b8fe8a0 --- /dev/null +++ b/llm_tutorial/merged_models/en-ja-base_0.33+instruct/special_tokens_map.json @@ -0,0 +1,51 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "cls_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "mask_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "sep_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/llm_tutorial/merged_models/en-ja-base_0.33/model.safetensors.index.json b/llm_tutorial/merged_models/en-ja-base_0.33/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..278c4d734d280d09567f7444303b0bd6712a70e9 --- /dev/null +++ b/llm_tutorial/merged_models/en-ja-base_0.33/model.safetensors.index.json @@ -0,0 +1 @@ +{"metadata": {"mergekit_version": "0.0.5.1", "total_size": 7565826048}, "weight_map": {"lm_head.weight": "model-00001-of-00002.safetensors", "model.embed_tokens.weight": "model-00001-of-00002.safetensors", "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.21.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.21.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.21.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.22.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.22.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.22.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.22.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.22.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.22.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.23.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.23.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.23.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.23.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.23.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.3.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.3.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.3.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.4.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.4.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.4.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.5.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.5.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.5.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.6.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.6.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.6.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.7.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.7.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.7.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.8.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.8.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.8.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.9.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.9.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.9.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.norm.weight": "model-00002-of-00002.safetensors"}} \ No newline at end of file diff --git a/llm_tutorial/merged_models/en-ja-base_dare-linear/model.safetensors.index.json b/llm_tutorial/merged_models/en-ja-base_dare-linear/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..278c4d734d280d09567f7444303b0bd6712a70e9 --- /dev/null +++ b/llm_tutorial/merged_models/en-ja-base_dare-linear/model.safetensors.index.json @@ -0,0 +1 @@ +{"metadata": {"mergekit_version": "0.0.5.1", "total_size": 7565826048}, "weight_map": {"lm_head.weight": "model-00001-of-00002.safetensors", "model.embed_tokens.weight": "model-00001-of-00002.safetensors", "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.21.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.21.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.21.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.22.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.22.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.22.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.22.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.22.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.22.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.23.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.23.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.23.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.23.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.23.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.3.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.3.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.3.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.4.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.4.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.4.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.5.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.5.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.5.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.6.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.6.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.6.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.7.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.7.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.7.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.8.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.8.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.8.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.9.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.9.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.9.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.norm.weight": "model-00002-of-00002.safetensors"}} \ No newline at end of file diff --git a/llm_tutorial/merged_models4/ja-en_en-ja_ties02/tokenizer_config.json b/llm_tutorial/merged_models4/ja-en_en-ja_ties02/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..09aa8575abe71b38280c766ae331a8b70d68622f --- /dev/null +++ b/llm_tutorial/merged_models4/ja-en_en-ja_ties02/tokenizer_config.json @@ -0,0 +1,18 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "unk_token": "", + "bos_token": "", + "eos_token": "", + "pad_token": "", + "cls_token": "", + "sep_token": "", + "eod_token": "", + "mask_token": "", + "extra_ids": 0, + "sp_model_kwargs": {}, + "model_max_length": 1000000000000000019884624838656, + "clean_up_tokenization_spaces": false, + "special_tokens_map_file": null, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/llm_tutorial/merged_models4/ja-en_en-ja_ties03-instruct/model.safetensors.index.json b/llm_tutorial/merged_models4/ja-en_en-ja_ties03-instruct/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..278c4d734d280d09567f7444303b0bd6712a70e9 --- /dev/null +++ b/llm_tutorial/merged_models4/ja-en_en-ja_ties03-instruct/model.safetensors.index.json @@ -0,0 +1 @@ +{"metadata": {"mergekit_version": "0.0.5.1", "total_size": 7565826048}, "weight_map": {"lm_head.weight": "model-00001-of-00002.safetensors", "model.embed_tokens.weight": "model-00001-of-00002.safetensors", "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.21.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.21.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.21.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.22.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.22.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.22.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.22.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.22.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", "model.layers.22.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", "model.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", "model.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", "model.layers.23.input_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.23.mlp.down_proj.weight": "model-00001-of-00002.safetensors", "model.layers.23.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", "model.layers.23.mlp.up_proj.weight": "model-00001-of-00002.safetensors", "model.layers.23.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.3.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.3.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.3.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.4.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.4.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.4.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.5.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.5.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.5.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.6.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.6.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.6.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.7.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.7.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.7.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.8.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.8.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.8.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.layers.9.input_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.9.mlp.down_proj.weight": "model-00002-of-00002.safetensors", "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", "model.layers.9.mlp.up_proj.weight": "model-00002-of-00002.safetensors", "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", "model.norm.weight": "model-00002-of-00002.safetensors"}} \ No newline at end of file diff --git a/llm_tutorial/merged_models4/ja-en_en-ja_ties03/special_tokens_map.json b/llm_tutorial/merged_models4/ja-en_en-ja_ties03/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..8644c8f41e420e9707cea943b96e19b36748fcff --- /dev/null +++ b/llm_tutorial/merged_models4/ja-en_en-ja_ties03/special_tokens_map.json @@ -0,0 +1,10 @@ +{ + "bos_token": "", + "cls_token": "", + "eod_token": "", + "eos_token": "", + "mask_token": "", + "pad_token": "", + "sep_token": "", + "unk_token": "" +} diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/markupsafe/__pycache__/_native.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/markupsafe/__pycache__/_native.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1dbd503f22aec63697c1f2d57c878537a754ed16 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/markupsafe/__pycache__/_native.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/markupsafe/_native.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/markupsafe/_native.py new file mode 100644 index 0000000000000000000000000000000000000000..8117b2716d110074d9a81365c59343e81396b7f5 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/markupsafe/_native.py @@ -0,0 +1,63 @@ +import typing as t + +from . import Markup + + +def escape(s: t.Any) -> Markup: + """Replace the characters ``&``, ``<``, ``>``, ``'``, and ``"`` in + the string with HTML-safe sequences. Use this if you need to display + text that might contain such characters in HTML. + + If the object has an ``__html__`` method, it is called and the + return value is assumed to already be safe for HTML. + + :param s: An object to be converted to a string and escaped. + :return: A :class:`Markup` string with the escaped text. + """ + if hasattr(s, "__html__"): + return Markup(s.__html__()) + + return Markup( + str(s) + .replace("&", "&") + .replace(">", ">") + .replace("<", "<") + .replace("'", "'") + .replace('"', """) + ) + + +def escape_silent(s: t.Optional[t.Any]) -> Markup: + """Like :func:`escape` but treats ``None`` as the empty string. + Useful with optional values, as otherwise you get the string + ``'None'`` when the value is ``None``. + + >>> escape(None) + Markup('None') + >>> escape_silent(None) + Markup('') + """ + if s is None: + return Markup() + + return escape(s) + + +def soft_str(s: t.Any) -> str: + """Convert an object to a string if it isn't already. This preserves + a :class:`Markup` string rather than converting it back to a basic + string, so it will still be marked as safe and won't be escaped + again. + + >>> value = escape("") + >>> value + Markup('<User 1>') + >>> escape(str(value)) + Markup('&lt;User 1&gt;') + >>> escape(soft_str(value)) + Markup('<User 1>') + """ + if not isinstance(s, str): + return str(s) + + return s diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/markupsafe/_speedups.pyi b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/markupsafe/_speedups.pyi new file mode 100644 index 0000000000000000000000000000000000000000..f673240f6d299917d829a5fdbf85d25d84dd0a72 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/markupsafe/_speedups.pyi @@ -0,0 +1,9 @@ +from typing import Any +from typing import Optional + +from . import Markup + +def escape(s: Any) -> Markup: ... +def escape_silent(s: Optional[Any]) -> Markup: ... +def soft_str(s: Any) -> str: ... +def soft_unicode(s: Any) -> str: ... diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath-1.3.0.dist-info/LICENSE b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath-1.3.0.dist-info/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..9ecdc7586d08805bc984539f6672476e86e538b6 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath-1.3.0.dist-info/LICENSE @@ -0,0 +1,27 @@ +Copyright (c) 2005-2021 Fredrik Johansson and mpmath contributors + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + a. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + b. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + c. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH +DAMAGE. diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath-1.3.0.dist-info/METADATA b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath-1.3.0.dist-info/METADATA new file mode 100644 index 0000000000000000000000000000000000000000..994b48acdba5cd0fdfb28cd1fbb0a84ebf81cba5 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath-1.3.0.dist-info/METADATA @@ -0,0 +1,233 @@ +Metadata-Version: 2.1 +Name: mpmath +Version: 1.3.0 +Summary: Python library for arbitrary-precision floating-point arithmetic +Home-page: http://mpmath.org/ +Author: Fredrik Johansson +Author-email: fredrik.johansson@gmail.com +License: BSD +Project-URL: Source, https://github.com/fredrik-johansson/mpmath +Project-URL: Tracker, https://github.com/fredrik-johansson/mpmath/issues +Project-URL: Documentation, http://mpmath.org/doc/current/ +Classifier: License :: OSI Approved :: BSD License +Classifier: Topic :: Scientific/Engineering :: Mathematics +Classifier: Topic :: Software Development :: Libraries :: Python Modules +Classifier: Programming Language :: Python +Classifier: Programming Language :: Python :: 2 +Classifier: Programming Language :: Python :: 2.7 +Classifier: Programming Language :: Python :: 3 +Classifier: Programming Language :: Python :: 3.5 +Classifier: Programming Language :: Python :: 3.6 +Classifier: Programming Language :: Python :: 3.7 +Classifier: Programming Language :: Python :: 3.8 +Classifier: Programming Language :: Python :: 3.9 +Classifier: Programming Language :: Python :: Implementation :: CPython +Classifier: Programming Language :: Python :: Implementation :: PyPy +License-File: LICENSE +Provides-Extra: develop +Requires-Dist: pytest (>=4.6) ; extra == 'develop' +Requires-Dist: pycodestyle ; extra == 'develop' +Requires-Dist: pytest-cov ; extra == 'develop' +Requires-Dist: codecov ; extra == 'develop' +Requires-Dist: wheel ; extra == 'develop' +Provides-Extra: docs +Requires-Dist: sphinx ; extra == 'docs' +Provides-Extra: gmpy +Requires-Dist: gmpy2 (>=2.1.0a4) ; (platform_python_implementation != "PyPy") and extra == 'gmpy' +Provides-Extra: tests +Requires-Dist: pytest (>=4.6) ; extra == 'tests' + +mpmath +====== + +|pypi version| |Build status| |Code coverage status| |Zenodo Badge| + +.. |pypi version| image:: https://img.shields.io/pypi/v/mpmath.svg + :target: https://pypi.python.org/pypi/mpmath +.. |Build status| image:: https://github.com/fredrik-johansson/mpmath/workflows/test/badge.svg + :target: https://github.com/fredrik-johansson/mpmath/actions?workflow=test +.. |Code coverage status| image:: https://codecov.io/gh/fredrik-johansson/mpmath/branch/master/graph/badge.svg + :target: https://codecov.io/gh/fredrik-johansson/mpmath +.. |Zenodo Badge| image:: https://zenodo.org/badge/2934512.svg + :target: https://zenodo.org/badge/latestdoi/2934512 + +A Python library for arbitrary-precision floating-point arithmetic. + +Website: http://mpmath.org/ +Main author: Fredrik Johansson + +Mpmath is free software released under the New BSD License (see the +LICENSE file for details) + +0. History and credits +---------------------- + +The following people (among others) have contributed major patches +or new features to mpmath: + +* Pearu Peterson +* Mario Pernici +* Ondrej Certik +* Vinzent Steinberg +* Nimish Telang +* Mike Taschuk +* Case Van Horsen +* Jorn Baayen +* Chris Smith +* Juan Arias de Reyna +* Ioannis Tziakos +* Aaron Meurer +* Stefan Krastanov +* Ken Allen +* Timo Hartmann +* Sergey B Kirpichev +* Kris Kuhlman +* Paul Masson +* Michael Kagalenko +* Jonathan Warner +* Max Gaukler +* Guillermo Navas-Palencia +* Nike Dattani + +Numerous other people have contributed by reporting bugs, +requesting new features, or suggesting improvements to the +documentation. + +For a detailed changelog, including individual contributions, +see the CHANGES file. + +Fredrik's work on mpmath during summer 2008 was sponsored by Google +as part of the Google Summer of Code program. + +Fredrik's work on mpmath during summer 2009 was sponsored by the +American Institute of Mathematics under the support of the National Science +Foundation Grant No. 0757627 (FRG: L-functions and Modular Forms). + +Any opinions, findings, and conclusions or recommendations expressed in this +material are those of the author(s) and do not necessarily reflect the +views of the sponsors. + +Credit also goes to: + +* The authors of the GMP library and the Python wrapper + gmpy, enabling mpmath to become much faster at + high precision +* The authors of MPFR, pari/gp, MPFUN, and other arbitrary- + precision libraries, whose documentation has been helpful + for implementing many of the algorithms in mpmath +* Wikipedia contributors; Abramowitz & Stegun; Gradshteyn & Ryzhik; + Wolfram Research for MathWorld and the Wolfram Functions site. + These are the main references used for special functions + implementations. +* George Brandl for developing the Sphinx documentation tool + used to build mpmath's documentation + +Release history: + +* Version 1.3.0 released on March 7, 2023 +* Version 1.2.0 released on February 1, 2021 +* Version 1.1.0 released on December 11, 2018 +* Version 1.0.0 released on September 27, 2017 +* Version 0.19 released on June 10, 2014 +* Version 0.18 released on December 31, 2013 +* Version 0.17 released on February 1, 2011 +* Version 0.16 released on September 24, 2010 +* Version 0.15 released on June 6, 2010 +* Version 0.14 released on February 5, 2010 +* Version 0.13 released on August 13, 2009 +* Version 0.12 released on June 9, 2009 +* Version 0.11 released on January 26, 2009 +* Version 0.10 released on October 15, 2008 +* Version 0.9 released on August 23, 2008 +* Version 0.8 released on April 20, 2008 +* Version 0.7 released on March 12, 2008 +* Version 0.6 released on January 13, 2008 +* Version 0.5 released on November 24, 2007 +* Version 0.4 released on November 3, 2007 +* Version 0.3 released on October 5, 2007 +* Version 0.2 released on October 2, 2007 +* Version 0.1 released on September 27, 2007 + +1. Download & installation +-------------------------- + +Mpmath requires Python 2.7 or 3.5 (or later versions). It has been tested +with CPython 2.7, 3.5 through 3.7 and for PyPy. + +The latest release of mpmath can be downloaded from the mpmath +website and from https://github.com/fredrik-johansson/mpmath/releases + +It should also be available in the Python Package Index at +https://pypi.python.org/pypi/mpmath + +To install latest release of Mpmath with pip, simply run + +``pip install mpmath`` + +Or unpack the mpmath archive and run + +``python setup.py install`` + +Mpmath can also be installed using + +``python -m easy_install mpmath`` + +The latest development code is available from +https://github.com/fredrik-johansson/mpmath + +See the main documentation for more detailed instructions. + +2. Running tests +---------------- + +The unit tests in mpmath/tests/ can be run via the script +runtests.py, but it is recommended to run them with py.test +(https://pytest.org/), especially +to generate more useful reports in case there are failures. + +You may also want to check out the demo scripts in the demo +directory. + +The master branch is automatically tested by Travis CI. + +3. Documentation +---------------- + +Documentation in reStructuredText format is available in the +doc directory included with the source package. These files +are human-readable, but can be compiled to prettier HTML using +the build.py script (requires Sphinx, http://sphinx.pocoo.org/). + +See setup.txt in the documentation for more information. + +The most recent documentation is also available in HTML format: + +http://mpmath.org/doc/current/ + +4. Known problems +----------------- + +Mpmath is a work in progress. Major issues include: + +* Some functions may return incorrect values when given extremely + large arguments or arguments very close to singularities. + +* Directed rounding works for arithmetic operations. It is implemented + heuristically for other operations, and their results may be off by one + or two units in the last place (even if otherwise accurate). + +* Some IEEE 754 features are not available. Inifinities and NaN are + partially supported; denormal rounding is currently not available + at all. + +* The interface for switching precision and rounding is not finalized. + The current method is not threadsafe. + +5. Help and bug reports +----------------------- + +General questions and comments can be sent to the mpmath mailinglist, +mpmath@googlegroups.com + +You can also report bugs and send patches to the mpmath issue tracker, +https://github.com/fredrik-johansson/mpmath/issues diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath-1.3.0.dist-info/RECORD b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath-1.3.0.dist-info/RECORD new file mode 100644 index 0000000000000000000000000000000000000000..8af4834111971d7817e7f02177e4662e04e12f0e --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath-1.3.0.dist-info/RECORD @@ -0,0 +1,180 @@ +mpmath-1.3.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4 +mpmath-1.3.0.dist-info/LICENSE,sha256=wmyugdpFCOXiSZhXd6M4IfGDIj67dNf4z7-Q_n7vL7c,1537 +mpmath-1.3.0.dist-info/METADATA,sha256=RLZupES5wNGa6UgV01a_BHrmtoDBkmi1wmVofNaoFAY,8630 +mpmath-1.3.0.dist-info/RECORD,, +mpmath-1.3.0.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92 +mpmath-1.3.0.dist-info/top_level.txt,sha256=BUVWrh8EVlkOhM1n3X9S8msTaVcC-3s6Sjt60avHYus,7 +mpmath/__init__.py,sha256=skFYTSwfwDBLChAV6pI3SdewgAQR3UBtyrfIK_Jdn-g,8765 +mpmath/__pycache__/__init__.cpython-311.pyc,, +mpmath/__pycache__/ctx_base.cpython-311.pyc,, +mpmath/__pycache__/ctx_fp.cpython-311.pyc,, +mpmath/__pycache__/ctx_iv.cpython-311.pyc,, +mpmath/__pycache__/ctx_mp.cpython-311.pyc,, +mpmath/__pycache__/ctx_mp_python.cpython-311.pyc,, +mpmath/__pycache__/function_docs.cpython-311.pyc,, +mpmath/__pycache__/identification.cpython-311.pyc,, +mpmath/__pycache__/math2.cpython-311.pyc,, +mpmath/__pycache__/rational.cpython-311.pyc,, +mpmath/__pycache__/usertools.cpython-311.pyc,, +mpmath/__pycache__/visualization.cpython-311.pyc,, +mpmath/calculus/__init__.py,sha256=UAgCIJ1YmaeyTqpNzjBlCZGeIzLtUZMEEpl99VWNjus,162 +mpmath/calculus/__pycache__/__init__.cpython-311.pyc,, +mpmath/calculus/__pycache__/approximation.cpython-311.pyc,, +mpmath/calculus/__pycache__/calculus.cpython-311.pyc,, +mpmath/calculus/__pycache__/differentiation.cpython-311.pyc,, +mpmath/calculus/__pycache__/extrapolation.cpython-311.pyc,, +mpmath/calculus/__pycache__/inverselaplace.cpython-311.pyc,, +mpmath/calculus/__pycache__/odes.cpython-311.pyc,, +mpmath/calculus/__pycache__/optimization.cpython-311.pyc,, +mpmath/calculus/__pycache__/polynomials.cpython-311.pyc,, +mpmath/calculus/__pycache__/quadrature.cpython-311.pyc,, +mpmath/calculus/approximation.py,sha256=vyzu3YI6r63Oq1KFHrQz02mGXAcH23emqNYhJuUaFZ4,8817 +mpmath/calculus/calculus.py,sha256=A0gSp0hxSyEDfugJViY3CeWalF-vK701YftzrjSQzQ4,112 +mpmath/calculus/differentiation.py,sha256=2L6CBj8xtX9iip98NPbKsLtwtRjxi571wYmTMHFeL90,20226 +mpmath/calculus/extrapolation.py,sha256=xM0rvk2DFEF4iR1Jhl-Y3aS93iW9VVJX7y9IGpmzC-A,73306 +mpmath/calculus/inverselaplace.py,sha256=5-pn8N_t0PtgBTXixsXZ4xxrihK2J5gYsVfTKfDx4gA,36056 +mpmath/calculus/odes.py,sha256=gaHiw7IJjsONNTAa6izFPZpmcg9uyTp8MULnGdzTIGo,9908 +mpmath/calculus/optimization.py,sha256=bKnShXElBOmVOIOlFeksDsYCp9fYSmYwKmXDt0z26MM,32856 +mpmath/calculus/polynomials.py,sha256=D16BhU_SHbVi06IxNwABHR-H77IylndNsN3muPTuFYs,7877 +mpmath/calculus/quadrature.py,sha256=n-avtS8E43foV-5tr5lofgOBaiMUYE8AJjQcWI9QcKk,42432 +mpmath/ctx_base.py,sha256=rfjmfMyA55x8R_cWFINUwWVTElfZmyx5erKDdauSEVw,15985 +mpmath/ctx_fp.py,sha256=ctUjx_NoU0iFWk05cXDYCL2ZtLZOlWs1n6Zao3pbG2g,6572 +mpmath/ctx_iv.py,sha256=tqdMr-GDfkZk1EhoGeCAajy7pQv-RWtrVqhYjfI8r4g,17211 +mpmath/ctx_mp.py,sha256=d3r4t7xHNqSFtmqsA9Btq1Npy3WTM-pcM2_jeCyECxY,49452 +mpmath/ctx_mp_python.py,sha256=3olYWo4lk1SnQ0A_IaZ181qqG8u5pxGat_v-L4Qtn3Y,37815 +mpmath/function_docs.py,sha256=g4PP8n6ILXmHcLyA50sxK6Tmp_Z4_pRN-wDErU8D1i4,283512 +mpmath/functions/__init__.py,sha256=YXVdhqv-6LKm6cr5xxtTNTtuD9zDPKGQl8GmS0xz2xo,330 +mpmath/functions/__pycache__/__init__.cpython-311.pyc,, +mpmath/functions/__pycache__/bessel.cpython-311.pyc,, +mpmath/functions/__pycache__/elliptic.cpython-311.pyc,, +mpmath/functions/__pycache__/expintegrals.cpython-311.pyc,, +mpmath/functions/__pycache__/factorials.cpython-311.pyc,, +mpmath/functions/__pycache__/functions.cpython-311.pyc,, +mpmath/functions/__pycache__/hypergeometric.cpython-311.pyc,, +mpmath/functions/__pycache__/orthogonal.cpython-311.pyc,, +mpmath/functions/__pycache__/qfunctions.cpython-311.pyc,, +mpmath/functions/__pycache__/rszeta.cpython-311.pyc,, +mpmath/functions/__pycache__/signals.cpython-311.pyc,, +mpmath/functions/__pycache__/theta.cpython-311.pyc,, +mpmath/functions/__pycache__/zeta.cpython-311.pyc,, +mpmath/functions/__pycache__/zetazeros.cpython-311.pyc,, +mpmath/functions/bessel.py,sha256=dUPLu8frlK-vmf3-irX_7uvwyw4xccv6EIizmIZ88kM,37938 +mpmath/functions/elliptic.py,sha256=qz0yVMb4lWEeOTDL_DWz5u5awmGIPKAsuZFJXgwHJNU,42237 +mpmath/functions/expintegrals.py,sha256=75X_MRdYc1F_X73bgNiOJqwRlS2hqAzcFLl3RM2tCDc,11644 +mpmath/functions/factorials.py,sha256=8_6kCR7e4k1GwxiAOJu0NRadeF4jA28qx4hidhu4ILk,5273 +mpmath/functions/functions.py,sha256=ub2JExvqzCWLkm5yAm72Fr6fdWmZZUknq9_3w9MEigI,18100 +mpmath/functions/hypergeometric.py,sha256=Z0OMAMC4ylK42n_SnamyFVnUx6zHLyCLCoJDSZ1JrHY,51570 +mpmath/functions/orthogonal.py,sha256=FabkxKfBoSseA5flWu1a3re-2BYaew9augqIsT8LaLw,16097 +mpmath/functions/qfunctions.py,sha256=a3EHGKQt_jMd4x9I772Jz-TGFnGY-arWqPvZGz9QSe0,7633 +mpmath/functions/rszeta.py,sha256=yuUVp4ilIyDmXyE3WTBxDDjwfEJNypJnbPS-xPH5How,46184 +mpmath/functions/signals.py,sha256=ELotwQaW1CDpv-eeJzOZ5c23NhfaZcj9_Gkb3psvS0Q,703 +mpmath/functions/theta.py,sha256=KggOocczoMG6_HMoal4oEP7iZ4SKOou9JFE-WzY2r3M,37320 +mpmath/functions/zeta.py,sha256=ue7JY7GXA0oX8q08sQJl2CSRrZ7kOt8HsftpVjnTwrE,36410 +mpmath/functions/zetazeros.py,sha256=uq6TVyZBcY2MLX7VSdVfn0TOkowBLM9fXtnySEwaNzw,30858 +mpmath/identification.py,sha256=7aMdngRAaeL_MafDUNbmEIlGQSklHDZ8pmPFt-OLgkw,29253 +mpmath/libmp/__init__.py,sha256=UCDjLZw4brbklaCmSixCcPdLdHkz8sF_-6F_wr0duAg,3790 +mpmath/libmp/__pycache__/__init__.cpython-311.pyc,, +mpmath/libmp/__pycache__/backend.cpython-311.pyc,, +mpmath/libmp/__pycache__/gammazeta.cpython-311.pyc,, +mpmath/libmp/__pycache__/libelefun.cpython-311.pyc,, +mpmath/libmp/__pycache__/libhyper.cpython-311.pyc,, +mpmath/libmp/__pycache__/libintmath.cpython-311.pyc,, +mpmath/libmp/__pycache__/libmpc.cpython-311.pyc,, +mpmath/libmp/__pycache__/libmpf.cpython-311.pyc,, +mpmath/libmp/__pycache__/libmpi.cpython-311.pyc,, +mpmath/libmp/backend.py,sha256=26A8pUkaGov26vrrFNQVyWJ5LDtK8sl3UHrYLecaTjA,3360 +mpmath/libmp/gammazeta.py,sha256=Xqdw6PMoswDaSca_sOs-IglRuk3fb8c9p43M_lbcrlc,71469 +mpmath/libmp/libelefun.py,sha256=joBZP4FOdxPfieWso1LPtSr6dHydpG_LQiF_bYQYWMg,43861 +mpmath/libmp/libhyper.py,sha256=J9fmdDF6u27EcssEWvBuVaAa3hFjPvPN1SgRgu1dEbc,36624 +mpmath/libmp/libintmath.py,sha256=aIRT0rkUZ_sdGQf3TNCLd-pBMvtQWjssbvFLfK7U0jc,16688 +mpmath/libmp/libmpc.py,sha256=KBndUjs5YVS32-Id3fflDfYgpdW1Prx6zfo8Ez5Qbrs,26875 +mpmath/libmp/libmpf.py,sha256=vpP0kNVkScbCVoZogJ4Watl4I7Ce0d4dzHVjfVe57so,45021 +mpmath/libmp/libmpi.py,sha256=u0I5Eiwkqa-4-dXETi5k7MuaxBeZbvCAPFtl93U9YF0,27622 +mpmath/math2.py,sha256=O5Dglg81SsW0wfHDUJcXOD8-cCaLvbVIvyw0sVmRbpI,18561 +mpmath/matrices/__init__.py,sha256=ETzGDciYbq9ftiKwaMbJ15EI-KNXHrzRb-ZHehhqFjs,94 +mpmath/matrices/__pycache__/__init__.cpython-311.pyc,, +mpmath/matrices/__pycache__/calculus.cpython-311.pyc,, +mpmath/matrices/__pycache__/eigen.cpython-311.pyc,, +mpmath/matrices/__pycache__/eigen_symmetric.cpython-311.pyc,, +mpmath/matrices/__pycache__/linalg.cpython-311.pyc,, +mpmath/matrices/__pycache__/matrices.cpython-311.pyc,, +mpmath/matrices/calculus.py,sha256=PNRq-p2nxgT-fzC54K2depi8ddhdx6Q86G8qpUiHeUY,18609 +mpmath/matrices/eigen.py,sha256=GbDXI3CixzEdXxr1G86uUWkAngAvd-05MmSQ-Tsu_5k,24394 +mpmath/matrices/eigen_symmetric.py,sha256=FPKPeQr1cGYw6Y6ea32a1YdEWQDLP6JlQHEA2WfNLYg,58534 +mpmath/matrices/linalg.py,sha256=04C3ijzMFom7ob5fXBCDfyPPdo3BIboIeE8x2A6vqF0,26958 +mpmath/matrices/matrices.py,sha256=o78Eq62EHQnxcsR0LBoWDEGREOoN4L2iDM1q3dQrw0o,32331 +mpmath/rational.py,sha256=64d56fvZXngYZT7nOAHeFRUX77eJ1A0R3rpfWBU-mSo,5976 +mpmath/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +mpmath/tests/__pycache__/__init__.cpython-311.pyc,, +mpmath/tests/__pycache__/extratest_gamma.cpython-311.pyc,, +mpmath/tests/__pycache__/extratest_zeta.cpython-311.pyc,, +mpmath/tests/__pycache__/runtests.cpython-311.pyc,, +mpmath/tests/__pycache__/test_basic_ops.cpython-311.pyc,, +mpmath/tests/__pycache__/test_bitwise.cpython-311.pyc,, +mpmath/tests/__pycache__/test_calculus.cpython-311.pyc,, +mpmath/tests/__pycache__/test_compatibility.cpython-311.pyc,, +mpmath/tests/__pycache__/test_convert.cpython-311.pyc,, +mpmath/tests/__pycache__/test_diff.cpython-311.pyc,, +mpmath/tests/__pycache__/test_division.cpython-311.pyc,, +mpmath/tests/__pycache__/test_eigen.cpython-311.pyc,, +mpmath/tests/__pycache__/test_eigen_symmetric.cpython-311.pyc,, +mpmath/tests/__pycache__/test_elliptic.cpython-311.pyc,, +mpmath/tests/__pycache__/test_fp.cpython-311.pyc,, +mpmath/tests/__pycache__/test_functions.cpython-311.pyc,, +mpmath/tests/__pycache__/test_functions2.cpython-311.pyc,, +mpmath/tests/__pycache__/test_gammazeta.cpython-311.pyc,, +mpmath/tests/__pycache__/test_hp.cpython-311.pyc,, +mpmath/tests/__pycache__/test_identify.cpython-311.pyc,, +mpmath/tests/__pycache__/test_interval.cpython-311.pyc,, +mpmath/tests/__pycache__/test_levin.cpython-311.pyc,, +mpmath/tests/__pycache__/test_linalg.cpython-311.pyc,, +mpmath/tests/__pycache__/test_matrices.cpython-311.pyc,, +mpmath/tests/__pycache__/test_mpmath.cpython-311.pyc,, +mpmath/tests/__pycache__/test_ode.cpython-311.pyc,, +mpmath/tests/__pycache__/test_pickle.cpython-311.pyc,, +mpmath/tests/__pycache__/test_power.cpython-311.pyc,, +mpmath/tests/__pycache__/test_quad.cpython-311.pyc,, +mpmath/tests/__pycache__/test_rootfinding.cpython-311.pyc,, +mpmath/tests/__pycache__/test_special.cpython-311.pyc,, +mpmath/tests/__pycache__/test_str.cpython-311.pyc,, +mpmath/tests/__pycache__/test_summation.cpython-311.pyc,, +mpmath/tests/__pycache__/test_trig.cpython-311.pyc,, +mpmath/tests/__pycache__/test_visualization.cpython-311.pyc,, +mpmath/tests/__pycache__/torture.cpython-311.pyc,, +mpmath/tests/extratest_gamma.py,sha256=xidhXUelILcxtiPGoTBHjqUOKIJzEaZ_v3nntGQyWZQ,7228 +mpmath/tests/extratest_zeta.py,sha256=sg10j9RhjBpV2EdUqyYhGV2ERWvM--EvwwGIz6HTmlw,1003 +mpmath/tests/runtests.py,sha256=7NUV82F3K_5AhU8mCLUFf5OibtT7uloFCwPyM3l71wM,5189 +mpmath/tests/test_basic_ops.py,sha256=dsB8DRG-GrPzBaZ-bIauYabaeqXbfqBo9SIP9BqcTSs,15348 +mpmath/tests/test_bitwise.py,sha256=-nLYhgQbhDza3SQM63BhktYntACagqMYx9ib3dPnTKM,7686 +mpmath/tests/test_calculus.py,sha256=4oxtNfMpO4RLLoOzrv7r9-h8BcqfBsJIE6UpsHe7c4w,9187 +mpmath/tests/test_compatibility.py,sha256=_t3ASZ3jhfAMnN1voWX7PDNIDzn-3PokkJGIdT1x7y0,2306 +mpmath/tests/test_convert.py,sha256=JPcDcTJIWh5prIxjx5DM1aNWgqlUoF2KpHvAgK3uHi4,8834 +mpmath/tests/test_diff.py,sha256=qjiF8NxQ8vueuZ5ZHGPQ-kjcj_I7Jh_fEdFtaA8DzEI,2466 +mpmath/tests/test_division.py,sha256=6lUeZfmaBWvvszdqlWLMHgXPjVsxvW1WZpd4-jFWCpU,5340 +mpmath/tests/test_eigen.py,sha256=2mnqVATGbsJkvSVHPpitfAk881twFfb3LsO3XikV9Hs,3905 +mpmath/tests/test_eigen_symmetric.py,sha256=v0VimCicIU2owASDMBaP-t-30uq-pXcsglt95KBtNO4,8778 +mpmath/tests/test_elliptic.py,sha256=Kjiwq9Bb6N_OOzzWewGQ1M_PMa7vRs42V0t90gloZxo,26225 +mpmath/tests/test_fp.py,sha256=AJo0FTyH4BuUnUsv176LD956om308KGYndy-b54KGxM,89997 +mpmath/tests/test_functions.py,sha256=b47VywdomoOX6KmMmz9-iv2IqVIydwKSuUw2pWlFHrY,30955 +mpmath/tests/test_functions2.py,sha256=vlw2RWhL1oTcifnOMDx1a_YzN96UgNNIE5STeKRv1HY,96990 +mpmath/tests/test_gammazeta.py,sha256=AB34O0DV7AlEf9Z4brnCadeQU5-uAwhWRw5FZas65DA,27917 +mpmath/tests/test_hp.py,sha256=6hcENu6Te2klPEiTSeLBIRPlH7PADlJwFKbx8xpnOhg,10461 +mpmath/tests/test_identify.py,sha256=lGUIPfrB2paTg0cFUo64GmMzF77F9gs9FQjX7gxGHV8,692 +mpmath/tests/test_interval.py,sha256=TjYd7a9ca6iRJiLjw06isLeZTuGoGAPmgleDZ0cYfJ0,17527 +mpmath/tests/test_levin.py,sha256=P8M11yV1dj_gdSNv5xuwCzFiF86QyRDtPMjURy6wJ28,5090 +mpmath/tests/test_linalg.py,sha256=miKEnwB8iwWV13hi1bF1cg3hgB4rTKOR0fvDVfWmXds,10440 +mpmath/tests/test_matrices.py,sha256=qyA4Ml2CvNvW034lzB01G6wVgNr7UrgZqh2wkMXtpzM,7944 +mpmath/tests/test_mpmath.py,sha256=LVyJUeofiaxW-zLKWVBCz59L9UQsjlW0Ts9_oBiEv_4,196 +mpmath/tests/test_ode.py,sha256=zAxexBH4fnmFNO4bvEHbug1NJWC5zqfFaVDlYijowkY,1822 +mpmath/tests/test_pickle.py,sha256=Y8CKmDLFsJHUqG8CDaBw5ilrPP4YT1xijVduLpQ7XFE,401 +mpmath/tests/test_power.py,sha256=sz_K02SmNxpa6Kb1uJLN_N4tXTJGdQ___vPRshEN7Gk,5227 +mpmath/tests/test_quad.py,sha256=49Ltft0vZ_kdKLL5s-Kj-BzAVoF5LPVEUeNUzdOkghI,3893 +mpmath/tests/test_rootfinding.py,sha256=umQegEaKHmYOEl5jEyoD-VLKDtXsTJJkepKEr4c0dC0,3132 +mpmath/tests/test_special.py,sha256=YbMIoMIkJEvvKYIzS0CXthJFG0--j6un7-tcE6b7FPM,2848 +mpmath/tests/test_str.py,sha256=0WsGD9hMPRi8zcuYMA9Cu2mOvQiCFskPwMsMf8lBDK4,544 +mpmath/tests/test_summation.py,sha256=fdNlsvRVOsbWxbhlyDLDaEO2S8kTJrRMKIvB5-aNci0,2035 +mpmath/tests/test_trig.py,sha256=zPtkIEnZaThxcWur4k7BX8-2Jmj-AhO191Svv7ANYUU,4799 +mpmath/tests/test_visualization.py,sha256=1PqtkoUx-WsKYgTRiu5o9pBc85kwhf1lzU2eobDQCJM,944 +mpmath/tests/torture.py,sha256=LD95oES7JY2KroELK-m-jhvtbvZaKChnt0Cq7kFMNCw,7868 +mpmath/usertools.py,sha256=a-TDw7XSRsPdBEffxOooDV4WDFfuXnO58P75dcAD87I,3029 +mpmath/visualization.py,sha256=pnnbjcd9AhFVRBZavYX5gjx4ytK_kXoDDisYR6EpXhs,10627 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath-1.3.0.dist-info/WHEEL b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath-1.3.0.dist-info/WHEEL new file mode 100644 index 0000000000000000000000000000000000000000..57e3d840d59a650ac5bccbad5baeec47d155f0ad --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/mpmath-1.3.0.dist-info/WHEEL @@ -0,0 +1,5 @@ +Wheel-Version: 1.0 +Generator: bdist_wheel (0.38.4) +Root-Is-Purelib: true +Tag: py3-none-any + diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/lib/libcublas.so.11 b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/lib/libcublas.so.11 new file mode 100644 index 0000000000000000000000000000000000000000..77d919755938eb4b12caefa30445fcb8e58f9350 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/lib/libcublas.so.11 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b81d170cd613cf9ee24d30b483f7b6d8170d6d32a0354fc207d09c943ae3f62 +size 94729912 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/libcudnn_cnn_train.so.8 b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/libcudnn_cnn_train.so.8 new file mode 100644 index 0000000000000000000000000000000000000000..fbf1190cb9a9b2cdd9a735efa21554644380919a --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/libcudnn_cnn_train.so.8 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:801895419e9de24de496099ffff3eef143e82c487cde68c4e9197e96fbea11db +size 102270568 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/libcudnn_ops_infer.so.8 b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/libcudnn_ops_infer.so.8 new file mode 100644 index 0000000000000000000000000000000000000000..1fd29daa13eaafb12b26fe2741666d90c698f158 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/libcudnn_ops_infer.so.8 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e69b36512b6999005de457763369d44743a80f9eb1416e09c4e6029e4ab380d +size 97560024 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/lib/libcurand.so.10 b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/lib/libcurand.so.10 new file mode 100644 index 0000000000000000000000000000000000000000..fb862d071e604a088f55f4d94a3a2632570a0613 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/curand/lib/libcurand.so.10 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97813df916ebe8613efb8d57127b9119403332a64d929bfd172452c140e101de +size 101334448 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/concrete/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/concrete/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7048bd548cca04780479539339dd8cc49a21990f --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/concrete/__init__.py @@ -0,0 +1,8 @@ +from .products import product, Product +from .summations import summation, Sum + +__all__ = [ + 'product', 'Product', + + 'summation', 'Sum', +] diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/concrete/__pycache__/expr_with_intlimits.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/concrete/__pycache__/expr_with_intlimits.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a7ebc7564953b4b87e17fe31f763f83f969004d6 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/concrete/__pycache__/expr_with_intlimits.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/concrete/__pycache__/expr_with_limits.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/concrete/__pycache__/expr_with_limits.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..66fe1886f0a92fec56655314bd8569f4de8f2c95 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/concrete/__pycache__/expr_with_limits.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/concrete/__pycache__/guess.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/concrete/__pycache__/guess.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aa7d1aa20cde2911cef9b83c767cdb366b8ad690 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/concrete/__pycache__/guess.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/concrete/__pycache__/products.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/concrete/__pycache__/products.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d62a508f7a15a81f5966102fc0191f9d5cc52129 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/concrete/__pycache__/products.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/concrete/__pycache__/summations.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/concrete/__pycache__/summations.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0a6c085d252e49a5508a5fbf97c41be20f0ad94a Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/concrete/__pycache__/summations.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/concrete/gosper.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/concrete/gosper.py new file mode 100644 index 0000000000000000000000000000000000000000..c50bb8051fc0171878b207a1c7680b3ff7e2b1e4 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/concrete/gosper.py @@ -0,0 +1,227 @@ +"""Gosper's algorithm for hypergeometric summation. """ + +from sympy.core import S, Dummy, symbols +from sympy.polys import Poly, parallel_poly_from_expr, factor +from sympy.utilities.iterables import is_sequence + + +def gosper_normal(f, g, n, polys=True): + r""" + Compute the Gosper's normal form of ``f`` and ``g``. + + Explanation + =========== + + Given relatively prime univariate polynomials ``f`` and ``g``, + rewrite their quotient to a normal form defined as follows: + + .. math:: + \frac{f(n)}{g(n)} = Z \cdot \frac{A(n) C(n+1)}{B(n) C(n)} + + where ``Z`` is an arbitrary constant and ``A``, ``B``, ``C`` are + monic polynomials in ``n`` with the following properties: + + 1. `\gcd(A(n), B(n+h)) = 1 \forall h \in \mathbb{N}` + 2. `\gcd(B(n), C(n+1)) = 1` + 3. `\gcd(A(n), C(n)) = 1` + + This normal form, or rational factorization in other words, is a + crucial step in Gosper's algorithm and in solving of difference + equations. It can be also used to decide if two hypergeometric + terms are similar or not. + + This procedure will return a tuple containing elements of this + factorization in the form ``(Z*A, B, C)``. + + Examples + ======== + + >>> from sympy.concrete.gosper import gosper_normal + >>> from sympy.abc import n + + >>> gosper_normal(4*n+5, 2*(4*n+1)*(2*n+3), n, polys=False) + (1/4, n + 3/2, n + 1/4) + + """ + (p, q), opt = parallel_poly_from_expr( + (f, g), n, field=True, extension=True) + + a, A = p.LC(), p.monic() + b, B = q.LC(), q.monic() + + C, Z = A.one, a/b + h = Dummy('h') + + D = Poly(n + h, n, h, domain=opt.domain) + + R = A.resultant(B.compose(D)) + roots = set(R.ground_roots().keys()) + + for r in set(roots): + if not r.is_Integer or r < 0: + roots.remove(r) + + for i in sorted(roots): + d = A.gcd(B.shift(+i)) + + A = A.quo(d) + B = B.quo(d.shift(-i)) + + for j in range(1, i + 1): + C *= d.shift(-j) + + A = A.mul_ground(Z) + + if not polys: + A = A.as_expr() + B = B.as_expr() + C = C.as_expr() + + return A, B, C + + +def gosper_term(f, n): + r""" + Compute Gosper's hypergeometric term for ``f``. + + Explanation + =========== + + Suppose ``f`` is a hypergeometric term such that: + + .. math:: + s_n = \sum_{k=0}^{n-1} f_k + + and `f_k` does not depend on `n`. Returns a hypergeometric + term `g_n` such that `g_{n+1} - g_n = f_n`. + + Examples + ======== + + >>> from sympy.concrete.gosper import gosper_term + >>> from sympy import factorial + >>> from sympy.abc import n + + >>> gosper_term((4*n + 1)*factorial(n)/factorial(2*n + 1), n) + (-n - 1/2)/(n + 1/4) + + """ + from sympy.simplify import hypersimp + r = hypersimp(f, n) + + if r is None: + return None # 'f' is *not* a hypergeometric term + + p, q = r.as_numer_denom() + + A, B, C = gosper_normal(p, q, n) + B = B.shift(-1) + + N = S(A.degree()) + M = S(B.degree()) + K = S(C.degree()) + + if (N != M) or (A.LC() != B.LC()): + D = {K - max(N, M)} + elif not N: + D = {K - N + 1, S.Zero} + else: + D = {K - N + 1, (B.nth(N - 1) - A.nth(N - 1))/A.LC()} + + for d in set(D): + if not d.is_Integer or d < 0: + D.remove(d) + + if not D: + return None # 'f(n)' is *not* Gosper-summable + + d = max(D) + + coeffs = symbols('c:%s' % (d + 1), cls=Dummy) + domain = A.get_domain().inject(*coeffs) + + x = Poly(coeffs, n, domain=domain) + H = A*x.shift(1) - B*x - C + + from sympy.solvers.solvers import solve + solution = solve(H.coeffs(), coeffs) + + if solution is None: + return None # 'f(n)' is *not* Gosper-summable + + x = x.as_expr().subs(solution) + + for coeff in coeffs: + if coeff not in solution: + x = x.subs(coeff, 0) + + if x.is_zero: + return None # 'f(n)' is *not* Gosper-summable + else: + return B.as_expr()*x/C.as_expr() + + +def gosper_sum(f, k): + r""" + Gosper's hypergeometric summation algorithm. + + Explanation + =========== + + Given a hypergeometric term ``f`` such that: + + .. math :: + s_n = \sum_{k=0}^{n-1} f_k + + and `f(n)` does not depend on `n`, returns `g_{n} - g(0)` where + `g_{n+1} - g_n = f_n`, or ``None`` if `s_n` cannot be expressed + in closed form as a sum of hypergeometric terms. + + Examples + ======== + + >>> from sympy.concrete.gosper import gosper_sum + >>> from sympy import factorial + >>> from sympy.abc import n, k + + >>> f = (4*k + 1)*factorial(k)/factorial(2*k + 1) + >>> gosper_sum(f, (k, 0, n)) + (-factorial(n) + 2*factorial(2*n + 1))/factorial(2*n + 1) + >>> _.subs(n, 2) == sum(f.subs(k, i) for i in [0, 1, 2]) + True + >>> gosper_sum(f, (k, 3, n)) + (-60*factorial(n) + factorial(2*n + 1))/(60*factorial(2*n + 1)) + >>> _.subs(n, 5) == sum(f.subs(k, i) for i in [3, 4, 5]) + True + + References + ========== + + .. [1] Marko Petkovsek, Herbert S. Wilf, Doron Zeilberger, A = B, + AK Peters, Ltd., Wellesley, MA, USA, 1997, pp. 73--100 + + """ + indefinite = False + + if is_sequence(k): + k, a, b = k + else: + indefinite = True + + g = gosper_term(f, k) + + if g is None: + return None + + if indefinite: + result = f*g + else: + result = (f*(g + 1)).subs(k, b) - (f*g).subs(k, a) + + if result is S.NaN: + try: + result = (f*(g + 1)).limit(k, b) - (f*g).limit(k, a) + except NotImplementedError: + result = None + + return factor(result) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/concrete/guess.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/concrete/guess.py new file mode 100644 index 0000000000000000000000000000000000000000..f3974485d7cfe16ca2338797d00acf04c3aadfb6 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/concrete/guess.py @@ -0,0 +1,473 @@ +"""Various algorithms for helping identifying numbers and sequences.""" + + +from sympy.concrete.products import (Product, product) +from sympy.core import Function, S +from sympy.core.add import Add +from sympy.core.numbers import Integer, Rational +from sympy.core.symbol import Symbol, symbols +from sympy.core.sympify import sympify +from sympy.functions.elementary.exponential import exp +from sympy.functions.elementary.integers import floor +from sympy.integrals.integrals import integrate +from sympy.polys.polyfuncs import rational_interpolate as rinterp +from sympy.polys.polytools import lcm +from sympy.simplify.radsimp import denom +from sympy.utilities import public + + +@public +def find_simple_recurrence_vector(l): + """ + This function is used internally by other functions from the + sympy.concrete.guess module. While most users may want to rather use the + function find_simple_recurrence when looking for recurrence relations + among rational numbers, the current function may still be useful when + some post-processing has to be done. + + Explanation + =========== + + The function returns a vector of length n when a recurrence relation of + order n is detected in the sequence of rational numbers v. + + If the returned vector has a length 1, then the returned value is always + the list [0], which means that no relation has been found. + + While the functions is intended to be used with rational numbers, it should + work for other kinds of real numbers except for some cases involving + quadratic numbers; for that reason it should be used with some caution when + the argument is not a list of rational numbers. + + Examples + ======== + + >>> from sympy.concrete.guess import find_simple_recurrence_vector + >>> from sympy import fibonacci + >>> find_simple_recurrence_vector([fibonacci(k) for k in range(12)]) + [1, -1, -1] + + See Also + ======== + + See the function sympy.concrete.guess.find_simple_recurrence which is more + user-friendly. + + """ + q1 = [0] + q2 = [1] + b, z = 0, len(l) >> 1 + while len(q2) <= z: + while l[b]==0: + b += 1 + if b == len(l): + c = 1 + for x in q2: + c = lcm(c, denom(x)) + if q2[0]*c < 0: c = -c + for k in range(len(q2)): + q2[k] = int(q2[k]*c) + return q2 + a = S.One/l[b] + m = [a] + for k in range(b+1, len(l)): + m.append(-sum(l[j+1]*m[b-j-1] for j in range(b, k))*a) + l, m = m, [0] * max(len(q2), b+len(q1)) + for k, q in enumerate(q2): + m[k] = a*q + for k, q in enumerate(q1): + m[k+b] += q + while m[-1]==0: m.pop() # because trailing zeros can occur + q1, q2, b = q2, m, 1 + return [0] + +@public +def find_simple_recurrence(v, A=Function('a'), N=Symbol('n')): + """ + Detects and returns a recurrence relation from a sequence of several integer + (or rational) terms. The name of the function in the returned expression is + 'a' by default; the main variable is 'n' by default. The smallest index in + the returned expression is always n (and never n-1, n-2, etc.). + + Examples + ======== + + >>> from sympy.concrete.guess import find_simple_recurrence + >>> from sympy import fibonacci + >>> find_simple_recurrence([fibonacci(k) for k in range(12)]) + -a(n) - a(n + 1) + a(n + 2) + + >>> from sympy import Function, Symbol + >>> a = [1, 1, 1] + >>> for k in range(15): a.append(5*a[-1]-3*a[-2]+8*a[-3]) + >>> find_simple_recurrence(a, A=Function('f'), N=Symbol('i')) + -8*f(i) + 3*f(i + 1) - 5*f(i + 2) + f(i + 3) + + """ + p = find_simple_recurrence_vector(v) + n = len(p) + if n <= 1: return S.Zero + + return Add(*[A(N+n-1-k)*p[k] for k in range(n)]) + + +@public +def rationalize(x, maxcoeff=10000): + """ + Helps identifying a rational number from a float (or mpmath.mpf) value by + using a continued fraction. The algorithm stops as soon as a large partial + quotient is detected (greater than 10000 by default). + + Examples + ======== + + >>> from sympy.concrete.guess import rationalize + >>> from mpmath import cos, pi + >>> rationalize(cos(pi/3)) + 1/2 + + >>> from mpmath import mpf + >>> rationalize(mpf("0.333333333333333")) + 1/3 + + While the function is rather intended to help 'identifying' rational + values, it may be used in some cases for approximating real numbers. + (Though other functions may be more relevant in that case.) + + >>> rationalize(pi, maxcoeff = 250) + 355/113 + + See Also + ======== + + Several other methods can approximate a real number as a rational, like: + + * fractions.Fraction.from_decimal + * fractions.Fraction.from_float + * mpmath.identify + * mpmath.pslq by using the following syntax: mpmath.pslq([x, 1]) + * mpmath.findpoly by using the following syntax: mpmath.findpoly(x, 1) + * sympy.simplify.nsimplify (which is a more general function) + + The main difference between the current function and all these variants is + that control focuses on magnitude of partial quotients here rather than on + global precision of the approximation. If the real is "known to be" a + rational number, the current function should be able to detect it correctly + with the default settings even when denominator is great (unless its + expansion contains unusually big partial quotients) which may occur + when studying sequences of increasing numbers. If the user cares more + on getting simple fractions, other methods may be more convenient. + + """ + p0, p1 = 0, 1 + q0, q1 = 1, 0 + a = floor(x) + while a < maxcoeff or q1==0: + p = a*p1 + p0 + q = a*q1 + q0 + p0, p1 = p1, p + q0, q1 = q1, q + if x==a: break + x = 1/(x-a) + a = floor(x) + return sympify(p) / q + + +@public +def guess_generating_function_rational(v, X=Symbol('x')): + """ + Tries to "guess" a rational generating function for a sequence of rational + numbers v. + + Examples + ======== + + >>> from sympy.concrete.guess import guess_generating_function_rational + >>> from sympy import fibonacci + >>> l = [fibonacci(k) for k in range(5,15)] + >>> guess_generating_function_rational(l) + (3*x + 5)/(-x**2 - x + 1) + + See Also + ======== + + sympy.series.approximants + mpmath.pade + + """ + # a) compute the denominator as q + q = find_simple_recurrence_vector(v) + n = len(q) + if n <= 1: return None + # b) compute the numerator as p + p = [sum(v[i-k]*q[k] for k in range(min(i+1, n))) + for i in range(len(v)>>1)] + return (sum(p[k]*X**k for k in range(len(p))) + / sum(q[k]*X**k for k in range(n))) + + +@public +def guess_generating_function(v, X=Symbol('x'), types=['all'], maxsqrtn=2): + """ + Tries to "guess" a generating function for a sequence of rational numbers v. + Only a few patterns are implemented yet. + + Explanation + =========== + + The function returns a dictionary where keys are the name of a given type of + generating function. Six types are currently implemented: + + type | formal definition + -------+---------------------------------------------------------------- + ogf | f(x) = Sum( a_k * x^k , k: 0..infinity ) + egf | f(x) = Sum( a_k * x^k / k! , k: 0..infinity ) + lgf | f(x) = Sum( (-1)^(k+1) a_k * x^k / k , k: 1..infinity ) + | (with initial index being hold as 1 rather than 0) + hlgf | f(x) = Sum( a_k * x^k / k , k: 1..infinity ) + | (with initial index being hold as 1 rather than 0) + lgdogf | f(x) = derivate( log(Sum( a_k * x^k, k: 0..infinity )), x) + lgdegf | f(x) = derivate( log(Sum( a_k * x^k / k!, k: 0..infinity )), x) + + In order to spare time, the user can select only some types of generating + functions (default being ['all']). While forgetting to use a list in the + case of a single type may seem to work most of the time as in: types='ogf' + this (convenient) syntax may lead to unexpected extra results in some cases. + + Discarding a type when calling the function does not mean that the type will + not be present in the returned dictionary; it only means that no extra + computation will be performed for that type, but the function may still add + it in the result when it can be easily converted from another type. + + Two generating functions (lgdogf and lgdegf) are not even computed if the + initial term of the sequence is 0; it may be useful in that case to try + again after having removed the leading zeros. + + Examples + ======== + + >>> from sympy.concrete.guess import guess_generating_function as ggf + >>> ggf([k+1 for k in range(12)], types=['ogf', 'lgf', 'hlgf']) + {'hlgf': 1/(1 - x), 'lgf': 1/(x + 1), 'ogf': 1/(x**2 - 2*x + 1)} + + >>> from sympy import sympify + >>> l = sympify("[3/2, 11/2, 0, -121/2, -363/2, 121]") + >>> ggf(l) + {'ogf': (x + 3/2)/(11*x**2 - 3*x + 1)} + + >>> from sympy import fibonacci + >>> ggf([fibonacci(k) for k in range(5, 15)], types=['ogf']) + {'ogf': (3*x + 5)/(-x**2 - x + 1)} + + >>> from sympy import factorial + >>> ggf([factorial(k) for k in range(12)], types=['ogf', 'egf', 'lgf']) + {'egf': 1/(1 - x)} + + >>> ggf([k+1 for k in range(12)], types=['egf']) + {'egf': (x + 1)*exp(x), 'lgdegf': (x + 2)/(x + 1)} + + N-th root of a rational function can also be detected (below is an example + coming from the sequence A108626 from https://oeis.org). + The greatest n-th root to be tested is specified as maxsqrtn (default 2). + + >>> ggf([1, 2, 5, 14, 41, 124, 383, 1200, 3799, 12122, 38919])['ogf'] + sqrt(1/(x**4 + 2*x**2 - 4*x + 1)) + + References + ========== + + .. [1] "Concrete Mathematics", R.L. Graham, D.E. Knuth, O. Patashnik + .. [2] https://oeis.org/wiki/Generating_functions + + """ + # List of all types of all g.f. known by the algorithm + if 'all' in types: + types = ('ogf', 'egf', 'lgf', 'hlgf', 'lgdogf', 'lgdegf') + + result = {} + + # Ordinary Generating Function (ogf) + if 'ogf' in types: + # Perform some convolutions of the sequence with itself + t = [1] + [0]*(len(v) - 1) + for d in range(max(1, maxsqrtn)): + t = [sum(t[n-i]*v[i] for i in range(n+1)) for n in range(len(v))] + g = guess_generating_function_rational(t, X=X) + if g: + result['ogf'] = g**Rational(1, d+1) + break + + # Exponential Generating Function (egf) + if 'egf' in types: + # Transform sequence (division by factorial) + w, f = [], S.One + for i, k in enumerate(v): + f *= i if i else 1 + w.append(k/f) + # Perform some convolutions of the sequence with itself + t = [1] + [0]*(len(w) - 1) + for d in range(max(1, maxsqrtn)): + t = [sum(t[n-i]*w[i] for i in range(n+1)) for n in range(len(w))] + g = guess_generating_function_rational(t, X=X) + if g: + result['egf'] = g**Rational(1, d+1) + break + + # Logarithmic Generating Function (lgf) + if 'lgf' in types: + # Transform sequence (multiplication by (-1)^(n+1) / n) + w, f = [], S.NegativeOne + for i, k in enumerate(v): + f = -f + w.append(f*k/Integer(i+1)) + # Perform some convolutions of the sequence with itself + t = [1] + [0]*(len(w) - 1) + for d in range(max(1, maxsqrtn)): + t = [sum(t[n-i]*w[i] for i in range(n+1)) for n in range(len(w))] + g = guess_generating_function_rational(t, X=X) + if g: + result['lgf'] = g**Rational(1, d+1) + break + + # Hyperbolic logarithmic Generating Function (hlgf) + if 'hlgf' in types: + # Transform sequence (division by n+1) + w = [] + for i, k in enumerate(v): + w.append(k/Integer(i+1)) + # Perform some convolutions of the sequence with itself + t = [1] + [0]*(len(w) - 1) + for d in range(max(1, maxsqrtn)): + t = [sum(t[n-i]*w[i] for i in range(n+1)) for n in range(len(w))] + g = guess_generating_function_rational(t, X=X) + if g: + result['hlgf'] = g**Rational(1, d+1) + break + + # Logarithmic derivative of ordinary generating Function (lgdogf) + if v[0] != 0 and ('lgdogf' in types + or ('ogf' in types and 'ogf' not in result)): + # Transform sequence by computing f'(x)/f(x) + # because log(f(x)) = integrate( f'(x)/f(x) ) + a, w = sympify(v[0]), [] + for n in range(len(v)-1): + w.append( + (v[n+1]*(n+1) - sum(w[-i-1]*v[i+1] for i in range(n)))/a) + # Perform some convolutions of the sequence with itself + t = [1] + [0]*(len(w) - 1) + for d in range(max(1, maxsqrtn)): + t = [sum(t[n-i]*w[i] for i in range(n+1)) for n in range(len(w))] + g = guess_generating_function_rational(t, X=X) + if g: + result['lgdogf'] = g**Rational(1, d+1) + if 'ogf' not in result: + result['ogf'] = exp(integrate(result['lgdogf'], X)) + break + + # Logarithmic derivative of exponential generating Function (lgdegf) + if v[0] != 0 and ('lgdegf' in types + or ('egf' in types and 'egf' not in result)): + # Transform sequence / step 1 (division by factorial) + z, f = [], S.One + for i, k in enumerate(v): + f *= i if i else 1 + z.append(k/f) + # Transform sequence / step 2 by computing f'(x)/f(x) + # because log(f(x)) = integrate( f'(x)/f(x) ) + a, w = z[0], [] + for n in range(len(z)-1): + w.append( + (z[n+1]*(n+1) - sum(w[-i-1]*z[i+1] for i in range(n)))/a) + # Perform some convolutions of the sequence with itself + t = [1] + [0]*(len(w) - 1) + for d in range(max(1, maxsqrtn)): + t = [sum(t[n-i]*w[i] for i in range(n+1)) for n in range(len(w))] + g = guess_generating_function_rational(t, X=X) + if g: + result['lgdegf'] = g**Rational(1, d+1) + if 'egf' not in result: + result['egf'] = exp(integrate(result['lgdegf'], X)) + break + + return result + + +@public +def guess(l, all=False, evaluate=True, niter=2, variables=None): + """ + This function is adapted from the Rate.m package for Mathematica + written by Christian Krattenthaler. + It tries to guess a formula from a given sequence of rational numbers. + + Explanation + =========== + + In order to speed up the process, the 'all' variable is set to False by + default, stopping the computation as some results are returned during an + iteration; the variable can be set to True if more iterations are needed + (other formulas may be found; however they may be equivalent to the first + ones). + + Another option is the 'evaluate' variable (default is True); setting it + to False will leave the involved products unevaluated. + + By default, the number of iterations is set to 2 but a greater value (up + to len(l)-1) can be specified with the optional 'niter' variable. + More and more convoluted results are found when the order of the + iteration gets higher: + + * first iteration returns polynomial or rational functions; + * second iteration returns products of rising factorials and their + inverses; + * third iteration returns products of products of rising factorials + and their inverses; + * etc. + + The returned formulas contain symbols i0, i1, i2, ... where the main + variables is i0 (and auxiliary variables are i1, i2, ...). A list of + other symbols can be provided in the 'variables' option; the length of + the least should be the value of 'niter' (more is acceptable but only + the first symbols will be used); in this case, the main variable will be + the first symbol in the list. + + Examples + ======== + + >>> from sympy.concrete.guess import guess + >>> guess([1,2,6,24,120], evaluate=False) + [Product(i1 + 1, (i1, 1, i0 - 1))] + + >>> from sympy import symbols + >>> r = guess([1,2,7,42,429,7436,218348,10850216], niter=4) + >>> i0 = symbols("i0") + >>> [r[0].subs(i0,n).doit() for n in range(1,10)] + [1, 2, 7, 42, 429, 7436, 218348, 10850216, 911835460] + """ + if any(a==0 for a in l[:-1]): + return [] + N = len(l) + niter = min(N-1, niter) + myprod = product if evaluate else Product + g = [] + res = [] + if variables is None: + symb = symbols('i:'+str(niter)) + else: + symb = variables + for k, s in enumerate(symb): + g.append(l) + n, r = len(l), [] + for i in range(n-2-1, -1, -1): + ri = rinterp(enumerate(g[k][:-1], start=1), i, X=s) + if ((denom(ri).subs({s:n}) != 0) + and (ri.subs({s:n}) - g[k][-1] == 0) + and ri not in r): + r.append(ri) + if r: + for i in range(k-1, -1, -1): + r = [g[i][0] + * myprod(v, (symb[i+1], 1, symb[i]-1)) for v in r] + if not all: return r + res += r + l = [Rational(l[i+1], l[i]) for i in range(N-k-1)] + return res diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/concrete/tests/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/concrete/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/concrete/tests/__pycache__/test_guess.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/concrete/tests/__pycache__/test_guess.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a2ab6e5a68eb5e837f7f8b6fd47ddc46e485d95f Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/concrete/tests/__pycache__/test_guess.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/concrete/tests/__pycache__/test_products.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/concrete/tests/__pycache__/test_products.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3474dcabbc2942320a1629ac27538d328e631f9e Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/concrete/tests/__pycache__/test_products.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/concrete/tests/test_gosper.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/concrete/tests/test_gosper.py new file mode 100644 index 0000000000000000000000000000000000000000..77b642a9b7cd55f96840a8e20e517206b6a6f8f0 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/concrete/tests/test_gosper.py @@ -0,0 +1,204 @@ +"""Tests for Gosper's algorithm for hypergeometric summation. """ + +from sympy.core.numbers import (Rational, pi) +from sympy.core.singleton import S +from sympy.core.symbol import Symbol +from sympy.functions.combinatorial.factorials import (binomial, factorial) +from sympy.functions.elementary.exponential import (exp, log) +from sympy.functions.elementary.miscellaneous import sqrt +from sympy.functions.special.gamma_functions import gamma +from sympy.polys.polytools import Poly +from sympy.simplify.simplify import simplify +from sympy.concrete.gosper import gosper_normal, gosper_sum, gosper_term +from sympy.abc import a, b, j, k, m, n, r, x + + +def test_gosper_normal(): + eq = 4*n + 5, 2*(4*n + 1)*(2*n + 3), n + assert gosper_normal(*eq) == \ + (Poly(Rational(1, 4), n), Poly(n + Rational(3, 2)), Poly(n + Rational(1, 4))) + assert gosper_normal(*eq, polys=False) == \ + (Rational(1, 4), n + Rational(3, 2), n + Rational(1, 4)) + + +def test_gosper_term(): + assert gosper_term((4*k + 1)*factorial( + k)/factorial(2*k + 1), k) == (-k - S.Half)/(k + Rational(1, 4)) + + +def test_gosper_sum(): + assert gosper_sum(1, (k, 0, n)) == 1 + n + assert gosper_sum(k, (k, 0, n)) == n*(1 + n)/2 + assert gosper_sum(k**2, (k, 0, n)) == n*(1 + n)*(1 + 2*n)/6 + assert gosper_sum(k**3, (k, 0, n)) == n**2*(1 + n)**2/4 + + assert gosper_sum(2**k, (k, 0, n)) == 2*2**n - 1 + + assert gosper_sum(factorial(k), (k, 0, n)) is None + assert gosper_sum(binomial(n, k), (k, 0, n)) is None + + assert gosper_sum(factorial(k)/k**2, (k, 0, n)) is None + assert gosper_sum((k - 3)*factorial(k), (k, 0, n)) is None + + assert gosper_sum(k*factorial(k), k) == factorial(k) + assert gosper_sum( + k*factorial(k), (k, 0, n)) == n*factorial(n) + factorial(n) - 1 + + assert gosper_sum((-1)**k*binomial(n, k), (k, 0, n)) == 0 + assert gosper_sum(( + -1)**k*binomial(n, k), (k, 0, m)) == -(-1)**m*(m - n)*binomial(n, m)/n + + assert gosper_sum((4*k + 1)*factorial(k)/factorial(2*k + 1), (k, 0, n)) == \ + (2*factorial(2*n + 1) - factorial(n))/factorial(2*n + 1) + + # issue 6033: + assert gosper_sum( + n*(n + a + b)*a**n*b**n/(factorial(n + a)*factorial(n + b)), \ + (n, 0, m)).simplify() == -exp(m*log(a) + m*log(b))*gamma(a + 1) \ + *gamma(b + 1)/(gamma(a)*gamma(b)*gamma(a + m + 1)*gamma(b + m + 1)) \ + + 1/(gamma(a)*gamma(b)) + + +def test_gosper_sum_indefinite(): + assert gosper_sum(k, k) == k*(k - 1)/2 + assert gosper_sum(k**2, k) == k*(k - 1)*(2*k - 1)/6 + + assert gosper_sum(1/(k*(k + 1)), k) == -1/k + assert gosper_sum(-(27*k**4 + 158*k**3 + 430*k**2 + 678*k + 445)*gamma(2*k + + 4)/(3*(3*k + 7)*gamma(3*k + 6)), k) == \ + (3*k + 5)*(k**2 + 2*k + 5)*gamma(2*k + 4)/gamma(3*k + 6) + + +def test_gosper_sum_parametric(): + assert gosper_sum(binomial(S.Half, m - j + 1)*binomial(S.Half, m + j), (j, 1, n)) == \ + n*(1 + m - n)*(-1 + 2*m + 2*n)*binomial(S.Half, 1 + m - n)* \ + binomial(S.Half, m + n)/(m*(1 + 2*m)) + + +def test_gosper_sum_algebraic(): + assert gosper_sum( + n**2 + sqrt(2), (n, 0, m)) == (m + 1)*(2*m**2 + m + 6*sqrt(2))/6 + + +def test_gosper_sum_iterated(): + f1 = binomial(2*k, k)/4**k + f2 = (1 + 2*n)*binomial(2*n, n)/4**n + f3 = (1 + 2*n)*(3 + 2*n)*binomial(2*n, n)/(3*4**n) + f4 = (1 + 2*n)*(3 + 2*n)*(5 + 2*n)*binomial(2*n, n)/(15*4**n) + f5 = (1 + 2*n)*(3 + 2*n)*(5 + 2*n)*(7 + 2*n)*binomial(2*n, n)/(105*4**n) + + assert gosper_sum(f1, (k, 0, n)) == f2 + assert gosper_sum(f2, (n, 0, n)) == f3 + assert gosper_sum(f3, (n, 0, n)) == f4 + assert gosper_sum(f4, (n, 0, n)) == f5 + +# the AeqB tests test expressions given in +# www.math.upenn.edu/~wilf/AeqB.pdf + + +def test_gosper_sum_AeqB_part1(): + f1a = n**4 + f1b = n**3*2**n + f1c = 1/(n**2 + sqrt(5)*n - 1) + f1d = n**4*4**n/binomial(2*n, n) + f1e = factorial(3*n)/(factorial(n)*factorial(n + 1)*factorial(n + 2)*27**n) + f1f = binomial(2*n, n)**2/((n + 1)*4**(2*n)) + f1g = (4*n - 1)*binomial(2*n, n)**2/((2*n - 1)**2*4**(2*n)) + f1h = n*factorial(n - S.Half)**2/factorial(n + 1)**2 + + g1a = m*(m + 1)*(2*m + 1)*(3*m**2 + 3*m - 1)/30 + g1b = 26 + 2**(m + 1)*(m**3 - 3*m**2 + 9*m - 13) + g1c = (m + 1)*(m*(m**2 - 7*m + 3)*sqrt(5) - ( + 3*m**3 - 7*m**2 + 19*m - 6))/(2*m**3*sqrt(5) + m**4 + 5*m**2 - 1)/6 + g1d = Rational(-2, 231) + 2*4**m*(m + 1)*(63*m**4 + 112*m**3 + 18*m**2 - + 22*m + 3)/(693*binomial(2*m, m)) + g1e = Rational(-9, 2) + (81*m**2 + 261*m + 200)*factorial( + 3*m + 2)/(40*27**m*factorial(m)*factorial(m + 1)*factorial(m + 2)) + g1f = (2*m + 1)**2*binomial(2*m, m)**2/(4**(2*m)*(m + 1)) + g1g = -binomial(2*m, m)**2/4**(2*m) + g1h = 4*pi -(2*m + 1)**2*(3*m + 4)*factorial(m - S.Half)**2/factorial(m + 1)**2 + + g = gosper_sum(f1a, (n, 0, m)) + assert g is not None and simplify(g - g1a) == 0 + g = gosper_sum(f1b, (n, 0, m)) + assert g is not None and simplify(g - g1b) == 0 + g = gosper_sum(f1c, (n, 0, m)) + assert g is not None and simplify(g - g1c) == 0 + g = gosper_sum(f1d, (n, 0, m)) + assert g is not None and simplify(g - g1d) == 0 + g = gosper_sum(f1e, (n, 0, m)) + assert g is not None and simplify(g - g1e) == 0 + g = gosper_sum(f1f, (n, 0, m)) + assert g is not None and simplify(g - g1f) == 0 + g = gosper_sum(f1g, (n, 0, m)) + assert g is not None and simplify(g - g1g) == 0 + g = gosper_sum(f1h, (n, 0, m)) + # need to call rewrite(gamma) here because we have terms involving + # factorial(1/2) + assert g is not None and simplify(g - g1h).rewrite(gamma) == 0 + + +def test_gosper_sum_AeqB_part2(): + f2a = n**2*a**n + f2b = (n - r/2)*binomial(r, n) + f2c = factorial(n - 1)**2/(factorial(n - x)*factorial(n + x)) + + g2a = -a*(a + 1)/(a - 1)**3 + a**( + m + 1)*(a**2*m**2 - 2*a*m**2 + m**2 - 2*a*m + 2*m + a + 1)/(a - 1)**3 + g2b = (m - r)*binomial(r, m)/2 + ff = factorial(1 - x)*factorial(1 + x) + g2c = 1/ff*( + 1 - 1/x**2) + factorial(m)**2/(x**2*factorial(m - x)*factorial(m + x)) + + g = gosper_sum(f2a, (n, 0, m)) + assert g is not None and simplify(g - g2a) == 0 + g = gosper_sum(f2b, (n, 0, m)) + assert g is not None and simplify(g - g2b) == 0 + g = gosper_sum(f2c, (n, 1, m)) + assert g is not None and simplify(g - g2c) == 0 + + +def test_gosper_nan(): + a = Symbol('a', positive=True) + b = Symbol('b', positive=True) + n = Symbol('n', integer=True) + m = Symbol('m', integer=True) + f2d = n*(n + a + b)*a**n*b**n/(factorial(n + a)*factorial(n + b)) + g2d = 1/(factorial(a - 1)*factorial( + b - 1)) - a**(m + 1)*b**(m + 1)/(factorial(a + m)*factorial(b + m)) + g = gosper_sum(f2d, (n, 0, m)) + assert simplify(g - g2d) == 0 + + +def test_gosper_sum_AeqB_part3(): + f3a = 1/n**4 + f3b = (6*n + 3)/(4*n**4 + 8*n**3 + 8*n**2 + 4*n + 3) + f3c = 2**n*(n**2 - 2*n - 1)/(n**2*(n + 1)**2) + f3d = n**2*4**n/((n + 1)*(n + 2)) + f3e = 2**n/(n + 1) + f3f = 4*(n - 1)*(n**2 - 2*n - 1)/(n**2*(n + 1)**2*(n - 2)**2*(n - 3)**2) + f3g = (n**4 - 14*n**2 - 24*n - 9)*2**n/(n**2*(n + 1)**2*(n + 2)**2* + (n + 3)**2) + + # g3a -> no closed form + g3b = m*(m + 2)/(2*m**2 + 4*m + 3) + g3c = 2**m/m**2 - 2 + g3d = Rational(2, 3) + 4**(m + 1)*(m - 1)/(m + 2)/3 + # g3e -> no closed form + g3f = -(Rational(-1, 16) + 1/((m - 2)**2*(m + 1)**2)) # the AeqB key is wrong + g3g = Rational(-2, 9) + 2**(m + 1)/((m + 1)**2*(m + 3)**2) + + g = gosper_sum(f3a, (n, 1, m)) + assert g is None + g = gosper_sum(f3b, (n, 1, m)) + assert g is not None and simplify(g - g3b) == 0 + g = gosper_sum(f3c, (n, 1, m - 1)) + assert g is not None and simplify(g - g3c) == 0 + g = gosper_sum(f3d, (n, 1, m)) + assert g is not None and simplify(g - g3d) == 0 + g = gosper_sum(f3e, (n, 0, m - 1)) + assert g is None + g = gosper_sum(f3f, (n, 4, m)) + assert g is not None and simplify(g - g3f) == 0 + g = gosper_sum(f3g, (n, 1, m)) + assert g is not None and simplify(g - g3g) == 0 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/concrete/tests/test_guess.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/concrete/tests/test_guess.py new file mode 100644 index 0000000000000000000000000000000000000000..5ac5d02b89ad62a70a29bd450b71b284b6aea76d --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/concrete/tests/test_guess.py @@ -0,0 +1,82 @@ +from sympy.concrete.guess import ( + find_simple_recurrence_vector, + find_simple_recurrence, + rationalize, + guess_generating_function_rational, + guess_generating_function, + guess + ) +from sympy.concrete.products import Product +from sympy.core.function import Function +from sympy.core.numbers import Rational +from sympy.core.singleton import S +from sympy.core.symbol import (Symbol, symbols) +from sympy.core.sympify import sympify +from sympy.functions.combinatorial.factorials import (RisingFactorial, factorial) +from sympy.functions.combinatorial.numbers import fibonacci +from sympy.functions.elementary.exponential import exp + + +def test_find_simple_recurrence_vector(): + assert find_simple_recurrence_vector( + [fibonacci(k) for k in range(12)]) == [1, -1, -1] + + +def test_find_simple_recurrence(): + a = Function('a') + n = Symbol('n') + assert find_simple_recurrence([fibonacci(k) for k in range(12)]) == ( + -a(n) - a(n + 1) + a(n + 2)) + + f = Function('a') + i = Symbol('n') + a = [1, 1, 1] + for k in range(15): a.append(5*a[-1]-3*a[-2]+8*a[-3]) + assert find_simple_recurrence(a, A=f, N=i) == ( + -8*f(i) + 3*f(i + 1) - 5*f(i + 2) + f(i + 3)) + assert find_simple_recurrence([0, 2, 15, 74, 12, 3, 0, + 1, 2, 85, 4, 5, 63]) == 0 + + +def test_rationalize(): + from mpmath import cos, pi, mpf + assert rationalize(cos(pi/3)) == S.Half + assert rationalize(mpf("0.333333333333333")) == Rational(1, 3) + assert rationalize(mpf("-0.333333333333333")) == Rational(-1, 3) + assert rationalize(pi, maxcoeff = 250) == Rational(355, 113) + + +def test_guess_generating_function_rational(): + x = Symbol('x') + assert guess_generating_function_rational([fibonacci(k) + for k in range(5, 15)]) == ((3*x + 5)/(-x**2 - x + 1)) + + +def test_guess_generating_function(): + x = Symbol('x') + assert guess_generating_function([fibonacci(k) + for k in range(5, 15)])['ogf'] == ((3*x + 5)/(-x**2 - x + 1)) + assert guess_generating_function( + [1, 2, 5, 14, 41, 124, 383, 1200, 3799, 12122, 38919])['ogf'] == ( + (1/(x**4 + 2*x**2 - 4*x + 1))**S.Half) + assert guess_generating_function(sympify( + "[3/2, 11/2, 0, -121/2, -363/2, 121, 4719/2, 11495/2, -8712, -178717/2]") + )['ogf'] == (x + Rational(3, 2))/(11*x**2 - 3*x + 1) + assert guess_generating_function([factorial(k) for k in range(12)], + types=['egf'])['egf'] == 1/(-x + 1) + assert guess_generating_function([k+1 for k in range(12)], + types=['egf']) == {'egf': (x + 1)*exp(x), 'lgdegf': (x + 2)/(x + 1)} + + +def test_guess(): + i0, i1 = symbols('i0 i1') + assert guess([1, 2, 6, 24, 120], evaluate=False) == [Product(i1 + 1, (i1, 1, i0 - 1))] + assert guess([1, 2, 6, 24, 120]) == [RisingFactorial(2, i0 - 1)] + assert guess([1, 2, 7, 42, 429, 7436, 218348, 10850216], niter=4) == [ + 2**(i0 - 1)*(Rational(27, 16))**(i0**2/2 - 3*i0/2 + + 1)*Product(RisingFactorial(Rational(5, 3), i1 - 1)*RisingFactorial(Rational(7, 3), i1 + - 1)/(RisingFactorial(Rational(3, 2), i1 - 1)*RisingFactorial(Rational(5, 2), i1 - + 1)), (i1, 1, i0 - 1))] + assert guess([1, 0, 2]) == [] + x, y = symbols('x y') + assert guess([1, 2, 6, 24, 120], variables=[x, y]) == [RisingFactorial(2, x - 1)] diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..37b558f3f03f149dae6af20254e9b88192f7f1ed --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/__init__.py @@ -0,0 +1,72 @@ +"""A module that handles matrices. + +Includes functions for fast creating matrices like zero, one/eye, random +matrix, etc. +""" +from .exceptions import ShapeError, NonSquareMatrixError +from .kind import MatrixKind +from .dense import ( + GramSchmidt, casoratian, diag, eye, hessian, jordan_cell, + list2numpy, matrix2numpy, matrix_multiply_elementwise, ones, + randMatrix, rot_axis1, rot_axis2, rot_axis3, rot_ccw_axis1, + rot_ccw_axis2, rot_ccw_axis3, rot_givens, + symarray, wronskian, zeros) +from .dense import MutableDenseMatrix +from .matrixbase import DeferredVector, MatrixBase + +MutableMatrix = MutableDenseMatrix +Matrix = MutableMatrix + +from .sparse import MutableSparseMatrix +from .sparsetools import banded +from .immutable import ImmutableDenseMatrix, ImmutableSparseMatrix + +ImmutableMatrix = ImmutableDenseMatrix +SparseMatrix = MutableSparseMatrix + +from .expressions import ( + MatrixSlice, BlockDiagMatrix, BlockMatrix, FunctionMatrix, Identity, + Inverse, MatAdd, MatMul, MatPow, MatrixExpr, MatrixSymbol, Trace, + Transpose, ZeroMatrix, OneMatrix, blockcut, block_collapse, matrix_symbols, Adjoint, + hadamard_product, HadamardProduct, HadamardPower, Determinant, det, + diagonalize_vector, DiagMatrix, DiagonalMatrix, DiagonalOf, trace, + DotProduct, kronecker_product, KroneckerProduct, + PermutationMatrix, MatrixPermute, MatrixSet, Permanent, per) + +from .utilities import dotprodsimp + +__all__ = [ + 'ShapeError', 'NonSquareMatrixError', 'MatrixKind', + + 'GramSchmidt', 'casoratian', 'diag', 'eye', 'hessian', 'jordan_cell', + 'list2numpy', 'matrix2numpy', 'matrix_multiply_elementwise', 'ones', + 'randMatrix', 'rot_axis1', 'rot_axis2', 'rot_axis3', 'symarray', + 'wronskian', 'zeros', 'rot_ccw_axis1', 'rot_ccw_axis2', 'rot_ccw_axis3', + 'rot_givens', + + 'MutableDenseMatrix', + + 'DeferredVector', 'MatrixBase', + + 'Matrix', 'MutableMatrix', + + 'MutableSparseMatrix', + + 'banded', + + 'ImmutableDenseMatrix', 'ImmutableSparseMatrix', + + 'ImmutableMatrix', 'SparseMatrix', + + 'MatrixSlice', 'BlockDiagMatrix', 'BlockMatrix', 'FunctionMatrix', + 'Identity', 'Inverse', 'MatAdd', 'MatMul', 'MatPow', 'MatrixExpr', + 'MatrixSymbol', 'Trace', 'Transpose', 'ZeroMatrix', 'OneMatrix', + 'blockcut', 'block_collapse', 'matrix_symbols', 'Adjoint', + 'hadamard_product', 'HadamardProduct', 'HadamardPower', 'Determinant', + 'det', 'diagonalize_vector', 'DiagMatrix', 'DiagonalMatrix', + 'DiagonalOf', 'trace', 'DotProduct', 'kronecker_product', + 'KroneckerProduct', 'PermutationMatrix', 'MatrixPermute', 'MatrixSet', + 'Permanent', 'per', + + 'dotprodsimp', +] diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/__pycache__/decompositions.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/__pycache__/decompositions.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..14fe6ab1e2046705371c5e19cca77738aa91e2a4 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/__pycache__/decompositions.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/__pycache__/dense.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/__pycache__/dense.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..197331a2fe573e6f21766e50af81467567b6f803 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/__pycache__/dense.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/__pycache__/graph.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/__pycache__/graph.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d78951243252f14bd41290c1a4291c9801ac6383 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/__pycache__/graph.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/__pycache__/inverse.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/__pycache__/inverse.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ada2bd30ab8fe3fdba452cc6be039f7b384357c0 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/__pycache__/inverse.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/__pycache__/normalforms.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/__pycache__/normalforms.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..11be65ab592f0b61a0c80cee5f11ab31621bcaed Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/__pycache__/normalforms.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/__pycache__/reductions.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/__pycache__/reductions.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..39489039ec209bc812fcd8ca09f032508a40dc2c Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/__pycache__/reductions.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/__pycache__/sparsetools.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/__pycache__/sparsetools.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..18825f6c98e4543fec799a04413e26501f25d8e0 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/__pycache__/sparsetools.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/__pycache__/utilities.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/__pycache__/utilities.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..adf573122ce838e56495a413847dc32ca79ccd69 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/__pycache__/utilities.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/dense.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/dense.py new file mode 100644 index 0000000000000000000000000000000000000000..8382da5d9b55249d2c6e2aa645743782a5bd7fa3 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/dense.py @@ -0,0 +1,1093 @@ +import random + +from sympy.core.basic import Basic +from sympy.core.singleton import S +from sympy.core.symbol import Symbol +from sympy.core.sympify import sympify +from sympy.functions.elementary.trigonometric import cos, sin +from sympy.utilities.decorator import doctest_depends_on +from sympy.utilities.exceptions import sympy_deprecation_warning +from sympy.utilities.iterables import is_sequence + +from .exceptions import ShapeError +from .decompositions import _cholesky, _LDLdecomposition +from .matrixbase import MatrixBase +from .repmatrix import MutableRepMatrix, RepMatrix +from .solvers import _lower_triangular_solve, _upper_triangular_solve + + +__doctest_requires__ = {('symarray',): ['numpy']} + + +def _iszero(x): + """Returns True if x is zero.""" + return x.is_zero + + +class DenseMatrix(RepMatrix): + """Matrix implementation based on DomainMatrix as the internal representation""" + + # + # DenseMatrix is a superclass for both MutableDenseMatrix and + # ImmutableDenseMatrix. Methods shared by both classes but not for the + # Sparse classes should be implemented here. + # + + is_MatrixExpr = False # type: bool + + _op_priority = 10.01 + _class_priority = 4 + + @property + def _mat(self): + sympy_deprecation_warning( + """ + The private _mat attribute of Matrix is deprecated. Use the + .flat() method instead. + """, + deprecated_since_version="1.9", + active_deprecations_target="deprecated-private-matrix-attributes" + ) + + return self.flat() + + def _eval_inverse(self, **kwargs): + return self.inv(method=kwargs.get('method', 'GE'), + iszerofunc=kwargs.get('iszerofunc', _iszero), + try_block_diag=kwargs.get('try_block_diag', False)) + + def as_immutable(self): + """Returns an Immutable version of this Matrix + """ + from .immutable import ImmutableDenseMatrix as cls + return cls._fromrep(self._rep.copy()) + + def as_mutable(self): + """Returns a mutable version of this matrix + + Examples + ======== + + >>> from sympy import ImmutableMatrix + >>> X = ImmutableMatrix([[1, 2], [3, 4]]) + >>> Y = X.as_mutable() + >>> Y[1, 1] = 5 # Can set values in Y + >>> Y + Matrix([ + [1, 2], + [3, 5]]) + """ + return Matrix(self) + + def cholesky(self, hermitian=True): + return _cholesky(self, hermitian=hermitian) + + def LDLdecomposition(self, hermitian=True): + return _LDLdecomposition(self, hermitian=hermitian) + + def lower_triangular_solve(self, rhs): + return _lower_triangular_solve(self, rhs) + + def upper_triangular_solve(self, rhs): + return _upper_triangular_solve(self, rhs) + + cholesky.__doc__ = _cholesky.__doc__ + LDLdecomposition.__doc__ = _LDLdecomposition.__doc__ + lower_triangular_solve.__doc__ = _lower_triangular_solve.__doc__ + upper_triangular_solve.__doc__ = _upper_triangular_solve.__doc__ + + +def _force_mutable(x): + """Return a matrix as a Matrix, otherwise return x.""" + if getattr(x, 'is_Matrix', False): + return x.as_mutable() + elif isinstance(x, Basic): + return x + elif hasattr(x, '__array__'): + a = x.__array__() + if len(a.shape) == 0: + return sympify(a) + return Matrix(x) + return x + + +class MutableDenseMatrix(DenseMatrix, MutableRepMatrix): + + def simplify(self, **kwargs): + """Applies simplify to the elements of a matrix in place. + + This is a shortcut for M.applyfunc(lambda x: simplify(x, ratio, measure)) + + See Also + ======== + + sympy.simplify.simplify.simplify + """ + from sympy.simplify.simplify import simplify as _simplify + for (i, j), element in self.todok().items(): + self[i, j] = _simplify(element, **kwargs) + + +MutableMatrix = Matrix = MutableDenseMatrix + +########### +# Numpy Utility Functions: +# list2numpy, matrix2numpy, symmarray +########### + + +def list2numpy(l, dtype=object): # pragma: no cover + """Converts Python list of SymPy expressions to a NumPy array. + + See Also + ======== + + matrix2numpy + """ + from numpy import empty + a = empty(len(l), dtype) + for i, s in enumerate(l): + a[i] = s + return a + + +def matrix2numpy(m, dtype=object): # pragma: no cover + """Converts SymPy's matrix to a NumPy array. + + See Also + ======== + + list2numpy + """ + from numpy import empty + a = empty(m.shape, dtype) + for i in range(m.rows): + for j in range(m.cols): + a[i, j] = m[i, j] + return a + + +########### +# Rotation matrices: +# rot_givens, rot_axis[123], rot_ccw_axis[123] +########### + + +def rot_givens(i, j, theta, dim=3): + r"""Returns a a Givens rotation matrix, a a rotation in the + plane spanned by two coordinates axes. + + Explanation + =========== + + The Givens rotation corresponds to a generalization of rotation + matrices to any number of dimensions, given by: + + .. math:: + G(i, j, \theta) = + \begin{bmatrix} + 1 & \cdots & 0 & \cdots & 0 & \cdots & 0 \\ + \vdots & \ddots & \vdots & & \vdots & & \vdots \\ + 0 & \cdots & c & \cdots & -s & \cdots & 0 \\ + \vdots & & \vdots & \ddots & \vdots & & \vdots \\ + 0 & \cdots & s & \cdots & c & \cdots & 0 \\ + \vdots & & \vdots & & \vdots & \ddots & \vdots \\ + 0 & \cdots & 0 & \cdots & 0 & \cdots & 1 + \end{bmatrix} + + Where $c = \cos(\theta)$ and $s = \sin(\theta)$ appear at the intersections + ``i``\th and ``j``\th rows and columns. + + For fixed ``i > j``\, the non-zero elements of a Givens matrix are + given by: + + - $g_{kk} = 1$ for $k \ne i,\,j$ + - $g_{kk} = c$ for $k = i,\,j$ + - $g_{ji} = -g_{ij} = -s$ + + Parameters + ========== + + i : int between ``0`` and ``dim - 1`` + Represents first axis + j : int between ``0`` and ``dim - 1`` + Represents second axis + dim : int bigger than 1 + Number of dimentions. Defaults to 3. + + Examples + ======== + + >>> from sympy import pi, rot_givens + + A counterclockwise rotation of pi/3 (60 degrees) around + the third axis (z-axis): + + >>> rot_givens(1, 0, pi/3) + Matrix([ + [ 1/2, -sqrt(3)/2, 0], + [sqrt(3)/2, 1/2, 0], + [ 0, 0, 1]]) + + If we rotate by pi/2 (90 degrees): + + >>> rot_givens(1, 0, pi/2) + Matrix([ + [0, -1, 0], + [1, 0, 0], + [0, 0, 1]]) + + This can be generalized to any number + of dimensions: + + >>> rot_givens(1, 0, pi/2, dim=4) + Matrix([ + [0, -1, 0, 0], + [1, 0, 0, 0], + [0, 0, 1, 0], + [0, 0, 0, 1]]) + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Givens_rotation + + See Also + ======== + + rot_axis1: Returns a rotation matrix for a rotation of theta (in radians) + about the 1-axis (clockwise around the x axis) + rot_axis2: Returns a rotation matrix for a rotation of theta (in radians) + about the 2-axis (clockwise around the y axis) + rot_axis3: Returns a rotation matrix for a rotation of theta (in radians) + about the 3-axis (clockwise around the z axis) + rot_ccw_axis1: Returns a rotation matrix for a rotation of theta (in radians) + about the 1-axis (counterclockwise around the x axis) + rot_ccw_axis2: Returns a rotation matrix for a rotation of theta (in radians) + about the 2-axis (counterclockwise around the y axis) + rot_ccw_axis3: Returns a rotation matrix for a rotation of theta (in radians) + about the 3-axis (counterclockwise around the z axis) + """ + if not isinstance(dim, int) or dim < 2: + raise ValueError('dim must be an integer biggen than one, ' + 'got {}.'.format(dim)) + + if i == j: + raise ValueError('i and j must be different, ' + 'got ({}, {})'.format(i, j)) + + for ij in [i, j]: + if not isinstance(ij, int) or ij < 0 or ij > dim - 1: + raise ValueError('i and j must be integers between 0 and ' + '{}, got i={} and j={}.'.format(dim-1, i, j)) + + theta = sympify(theta) + c = cos(theta) + s = sin(theta) + M = eye(dim) + M[i, i] = c + M[j, j] = c + M[i, j] = s + M[j, i] = -s + return M + + +def rot_axis3(theta): + r"""Returns a rotation matrix for a rotation of theta (in radians) + about the 3-axis. + + Explanation + =========== + + For a right-handed coordinate system, this corresponds to a + clockwise rotation around the `z`-axis, given by: + + .. math:: + + R = \begin{bmatrix} + \cos(\theta) & \sin(\theta) & 0 \\ + -\sin(\theta) & \cos(\theta) & 0 \\ + 0 & 0 & 1 + \end{bmatrix} + + Examples + ======== + + >>> from sympy import pi, rot_axis3 + + A rotation of pi/3 (60 degrees): + + >>> theta = pi/3 + >>> rot_axis3(theta) + Matrix([ + [ 1/2, sqrt(3)/2, 0], + [-sqrt(3)/2, 1/2, 0], + [ 0, 0, 1]]) + + If we rotate by pi/2 (90 degrees): + + >>> rot_axis3(pi/2) + Matrix([ + [ 0, 1, 0], + [-1, 0, 0], + [ 0, 0, 1]]) + + See Also + ======== + + rot_givens: Returns a Givens rotation matrix (generalized rotation for + any number of dimensions) + rot_ccw_axis3: Returns a rotation matrix for a rotation of theta (in radians) + about the 3-axis (counterclockwise around the z axis) + rot_axis1: Returns a rotation matrix for a rotation of theta (in radians) + about the 1-axis (clockwise around the x axis) + rot_axis2: Returns a rotation matrix for a rotation of theta (in radians) + about the 2-axis (clockwise around the y axis) + """ + return rot_givens(0, 1, theta, dim=3) + + +def rot_axis2(theta): + r"""Returns a rotation matrix for a rotation of theta (in radians) + about the 2-axis. + + Explanation + =========== + + For a right-handed coordinate system, this corresponds to a + clockwise rotation around the `y`-axis, given by: + + .. math:: + + R = \begin{bmatrix} + \cos(\theta) & 0 & -\sin(\theta) \\ + 0 & 1 & 0 \\ + \sin(\theta) & 0 & \cos(\theta) + \end{bmatrix} + + Examples + ======== + + >>> from sympy import pi, rot_axis2 + + A rotation of pi/3 (60 degrees): + + >>> theta = pi/3 + >>> rot_axis2(theta) + Matrix([ + [ 1/2, 0, -sqrt(3)/2], + [ 0, 1, 0], + [sqrt(3)/2, 0, 1/2]]) + + If we rotate by pi/2 (90 degrees): + + >>> rot_axis2(pi/2) + Matrix([ + [0, 0, -1], + [0, 1, 0], + [1, 0, 0]]) + + See Also + ======== + + rot_givens: Returns a Givens rotation matrix (generalized rotation for + any number of dimensions) + rot_ccw_axis2: Returns a rotation matrix for a rotation of theta (in radians) + about the 2-axis (clockwise around the y axis) + rot_axis1: Returns a rotation matrix for a rotation of theta (in radians) + about the 1-axis (counterclockwise around the x axis) + rot_axis3: Returns a rotation matrix for a rotation of theta (in radians) + about the 3-axis (counterclockwise around the z axis) + """ + return rot_givens(2, 0, theta, dim=3) + + +def rot_axis1(theta): + r"""Returns a rotation matrix for a rotation of theta (in radians) + about the 1-axis. + + Explanation + =========== + + For a right-handed coordinate system, this corresponds to a + clockwise rotation around the `x`-axis, given by: + + .. math:: + + R = \begin{bmatrix} + 1 & 0 & 0 \\ + 0 & \cos(\theta) & \sin(\theta) \\ + 0 & -\sin(\theta) & \cos(\theta) + \end{bmatrix} + + Examples + ======== + + >>> from sympy import pi, rot_axis1 + + A rotation of pi/3 (60 degrees): + + >>> theta = pi/3 + >>> rot_axis1(theta) + Matrix([ + [1, 0, 0], + [0, 1/2, sqrt(3)/2], + [0, -sqrt(3)/2, 1/2]]) + + If we rotate by pi/2 (90 degrees): + + >>> rot_axis1(pi/2) + Matrix([ + [1, 0, 0], + [0, 0, 1], + [0, -1, 0]]) + + See Also + ======== + + rot_givens: Returns a Givens rotation matrix (generalized rotation for + any number of dimensions) + rot_ccw_axis1: Returns a rotation matrix for a rotation of theta (in radians) + about the 1-axis (counterclockwise around the x axis) + rot_axis2: Returns a rotation matrix for a rotation of theta (in radians) + about the 2-axis (clockwise around the y axis) + rot_axis3: Returns a rotation matrix for a rotation of theta (in radians) + about the 3-axis (clockwise around the z axis) + """ + return rot_givens(1, 2, theta, dim=3) + + +def rot_ccw_axis3(theta): + r"""Returns a rotation matrix for a rotation of theta (in radians) + about the 3-axis. + + Explanation + =========== + + For a right-handed coordinate system, this corresponds to a + counterclockwise rotation around the `z`-axis, given by: + + .. math:: + + R = \begin{bmatrix} + \cos(\theta) & -\sin(\theta) & 0 \\ + \sin(\theta) & \cos(\theta) & 0 \\ + 0 & 0 & 1 + \end{bmatrix} + + Examples + ======== + + >>> from sympy import pi, rot_ccw_axis3 + + A rotation of pi/3 (60 degrees): + + >>> theta = pi/3 + >>> rot_ccw_axis3(theta) + Matrix([ + [ 1/2, -sqrt(3)/2, 0], + [sqrt(3)/2, 1/2, 0], + [ 0, 0, 1]]) + + If we rotate by pi/2 (90 degrees): + + >>> rot_ccw_axis3(pi/2) + Matrix([ + [0, -1, 0], + [1, 0, 0], + [0, 0, 1]]) + + See Also + ======== + + rot_givens: Returns a Givens rotation matrix (generalized rotation for + any number of dimensions) + rot_axis3: Returns a rotation matrix for a rotation of theta (in radians) + about the 3-axis (clockwise around the z axis) + rot_ccw_axis1: Returns a rotation matrix for a rotation of theta (in radians) + about the 1-axis (counterclockwise around the x axis) + rot_ccw_axis2: Returns a rotation matrix for a rotation of theta (in radians) + about the 2-axis (counterclockwise around the y axis) + """ + return rot_givens(1, 0, theta, dim=3) + + +def rot_ccw_axis2(theta): + r"""Returns a rotation matrix for a rotation of theta (in radians) + about the 2-axis. + + Explanation + =========== + + For a right-handed coordinate system, this corresponds to a + counterclockwise rotation around the `y`-axis, given by: + + .. math:: + + R = \begin{bmatrix} + \cos(\theta) & 0 & \sin(\theta) \\ + 0 & 1 & 0 \\ + -\sin(\theta) & 0 & \cos(\theta) + \end{bmatrix} + + Examples + ======== + + >>> from sympy import pi, rot_ccw_axis2 + + A rotation of pi/3 (60 degrees): + + >>> theta = pi/3 + >>> rot_ccw_axis2(theta) + Matrix([ + [ 1/2, 0, sqrt(3)/2], + [ 0, 1, 0], + [-sqrt(3)/2, 0, 1/2]]) + + If we rotate by pi/2 (90 degrees): + + >>> rot_ccw_axis2(pi/2) + Matrix([ + [ 0, 0, 1], + [ 0, 1, 0], + [-1, 0, 0]]) + + See Also + ======== + + rot_givens: Returns a Givens rotation matrix (generalized rotation for + any number of dimensions) + rot_axis2: Returns a rotation matrix for a rotation of theta (in radians) + about the 2-axis (clockwise around the y axis) + rot_ccw_axis1: Returns a rotation matrix for a rotation of theta (in radians) + about the 1-axis (counterclockwise around the x axis) + rot_ccw_axis3: Returns a rotation matrix for a rotation of theta (in radians) + about the 3-axis (counterclockwise around the z axis) + """ + return rot_givens(0, 2, theta, dim=3) + + +def rot_ccw_axis1(theta): + r"""Returns a rotation matrix for a rotation of theta (in radians) + about the 1-axis. + + Explanation + =========== + + For a right-handed coordinate system, this corresponds to a + counterclockwise rotation around the `x`-axis, given by: + + .. math:: + + R = \begin{bmatrix} + 1 & 0 & 0 \\ + 0 & \cos(\theta) & -\sin(\theta) \\ + 0 & \sin(\theta) & \cos(\theta) + \end{bmatrix} + + Examples + ======== + + >>> from sympy import pi, rot_ccw_axis1 + + A rotation of pi/3 (60 degrees): + + >>> theta = pi/3 + >>> rot_ccw_axis1(theta) + Matrix([ + [1, 0, 0], + [0, 1/2, -sqrt(3)/2], + [0, sqrt(3)/2, 1/2]]) + + If we rotate by pi/2 (90 degrees): + + >>> rot_ccw_axis1(pi/2) + Matrix([ + [1, 0, 0], + [0, 0, -1], + [0, 1, 0]]) + + See Also + ======== + + rot_givens: Returns a Givens rotation matrix (generalized rotation for + any number of dimensions) + rot_axis1: Returns a rotation matrix for a rotation of theta (in radians) + about the 1-axis (clockwise around the x axis) + rot_ccw_axis2: Returns a rotation matrix for a rotation of theta (in radians) + about the 2-axis (counterclockwise around the y axis) + rot_ccw_axis3: Returns a rotation matrix for a rotation of theta (in radians) + about the 3-axis (counterclockwise around the z axis) + """ + return rot_givens(2, 1, theta, dim=3) + + +@doctest_depends_on(modules=('numpy',)) +def symarray(prefix, shape, **kwargs): # pragma: no cover + r"""Create a numpy ndarray of symbols (as an object array). + + The created symbols are named ``prefix_i1_i2_``... You should thus provide a + non-empty prefix if you want your symbols to be unique for different output + arrays, as SymPy symbols with identical names are the same object. + + Parameters + ---------- + + prefix : string + A prefix prepended to the name of every symbol. + + shape : int or tuple + Shape of the created array. If an int, the array is one-dimensional; for + more than one dimension the shape must be a tuple. + + \*\*kwargs : dict + keyword arguments passed on to Symbol + + Examples + ======== + These doctests require numpy. + + >>> from sympy import symarray + >>> symarray('', 3) + [_0 _1 _2] + + If you want multiple symarrays to contain distinct symbols, you *must* + provide unique prefixes: + + >>> a = symarray('', 3) + >>> b = symarray('', 3) + >>> a[0] == b[0] + True + >>> a = symarray('a', 3) + >>> b = symarray('b', 3) + >>> a[0] == b[0] + False + + Creating symarrays with a prefix: + + >>> symarray('a', 3) + [a_0 a_1 a_2] + + For more than one dimension, the shape must be given as a tuple: + + >>> symarray('a', (2, 3)) + [[a_0_0 a_0_1 a_0_2] + [a_1_0 a_1_1 a_1_2]] + >>> symarray('a', (2, 3, 2)) + [[[a_0_0_0 a_0_0_1] + [a_0_1_0 a_0_1_1] + [a_0_2_0 a_0_2_1]] + + [[a_1_0_0 a_1_0_1] + [a_1_1_0 a_1_1_1] + [a_1_2_0 a_1_2_1]]] + + For setting assumptions of the underlying Symbols: + + >>> [s.is_real for s in symarray('a', 2, real=True)] + [True, True] + """ + from numpy import empty, ndindex + arr = empty(shape, dtype=object) + for index in ndindex(shape): + arr[index] = Symbol('%s_%s' % (prefix, '_'.join(map(str, index))), + **kwargs) + return arr + + +############### +# Functions +############### + +def casoratian(seqs, n, zero=True): + """Given linear difference operator L of order 'k' and homogeneous + equation Ly = 0 we want to compute kernel of L, which is a set + of 'k' sequences: a(n), b(n), ... z(n). + + Solutions of L are linearly independent iff their Casoratian, + denoted as C(a, b, ..., z), do not vanish for n = 0. + + Casoratian is defined by k x k determinant:: + + + a(n) b(n) . . . z(n) + + | a(n+1) b(n+1) . . . z(n+1) | + | . . . . | + | . . . . | + | . . . . | + + a(n+k-1) b(n+k-1) . . . z(n+k-1) + + + It proves very useful in rsolve_hyper() where it is applied + to a generating set of a recurrence to factor out linearly + dependent solutions and return a basis: + + >>> from sympy import Symbol, casoratian, factorial + >>> n = Symbol('n', integer=True) + + Exponential and factorial are linearly independent: + + >>> casoratian([2**n, factorial(n)], n) != 0 + True + + """ + + seqs = list(map(sympify, seqs)) + + if not zero: + f = lambda i, j: seqs[j].subs(n, n + i) + else: + f = lambda i, j: seqs[j].subs(n, i) + + k = len(seqs) + + return Matrix(k, k, f).det() + + +def eye(*args, **kwargs): + """Create square identity matrix n x n + + See Also + ======== + + diag + zeros + ones + """ + + return Matrix.eye(*args, **kwargs) + + +def diag(*values, strict=True, unpack=False, **kwargs): + """Returns a matrix with the provided values placed on the + diagonal. If non-square matrices are included, they will + produce a block-diagonal matrix. + + Examples + ======== + + This version of diag is a thin wrapper to Matrix.diag that differs + in that it treats all lists like matrices -- even when a single list + is given. If this is not desired, either put a `*` before the list or + set `unpack=True`. + + >>> from sympy import diag + + >>> diag([1, 2, 3], unpack=True) # = diag(1,2,3) or diag(*[1,2,3]) + Matrix([ + [1, 0, 0], + [0, 2, 0], + [0, 0, 3]]) + + >>> diag([1, 2, 3]) # a column vector + Matrix([ + [1], + [2], + [3]]) + + See Also + ======== + .matrixbase.MatrixBase.eye + .matrixbase.MatrixBase.diagonal + .matrixbase.MatrixBase.diag + .expressions.blockmatrix.BlockMatrix + """ + return Matrix.diag(*values, strict=strict, unpack=unpack, **kwargs) + + +def GramSchmidt(vlist, orthonormal=False): + """Apply the Gram-Schmidt process to a set of vectors. + + Parameters + ========== + + vlist : List of Matrix + Vectors to be orthogonalized for. + + orthonormal : Bool, optional + If true, return an orthonormal basis. + + Returns + ======= + + vlist : List of Matrix + Orthogonalized vectors + + Notes + ===== + + This routine is mostly duplicate from ``Matrix.orthogonalize``, + except for some difference that this always raises error when + linearly dependent vectors are found, and the keyword ``normalize`` + has been named as ``orthonormal`` in this function. + + See Also + ======== + + .matrixbase.MatrixBase.orthogonalize + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Gram%E2%80%93Schmidt_process + """ + return MutableDenseMatrix.orthogonalize( + *vlist, normalize=orthonormal, rankcheck=True + ) + + +def hessian(f, varlist, constraints=()): + """Compute Hessian matrix for a function f wrt parameters in varlist + which may be given as a sequence or a row/column vector. A list of + constraints may optionally be given. + + Examples + ======== + + >>> from sympy import Function, hessian, pprint + >>> from sympy.abc import x, y + >>> f = Function('f')(x, y) + >>> g1 = Function('g')(x, y) + >>> g2 = x**2 + 3*y + >>> pprint(hessian(f, (x, y), [g1, g2])) + [ d d ] + [ 0 0 --(g(x, y)) --(g(x, y)) ] + [ dx dy ] + [ ] + [ 0 0 2*x 3 ] + [ ] + [ 2 2 ] + [d d d ] + [--(g(x, y)) 2*x ---(f(x, y)) -----(f(x, y))] + [dx 2 dy dx ] + [ dx ] + [ ] + [ 2 2 ] + [d d d ] + [--(g(x, y)) 3 -----(f(x, y)) ---(f(x, y)) ] + [dy dy dx 2 ] + [ dy ] + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Hessian_matrix + + See Also + ======== + + sympy.matrices.matrixbase.MatrixBase.jacobian + wronskian + """ + # f is the expression representing a function f, return regular matrix + if isinstance(varlist, MatrixBase): + if 1 not in varlist.shape: + raise ShapeError("`varlist` must be a column or row vector.") + if varlist.cols == 1: + varlist = varlist.T + varlist = varlist.tolist()[0] + if is_sequence(varlist): + n = len(varlist) + if not n: + raise ShapeError("`len(varlist)` must not be zero.") + else: + raise ValueError("Improper variable list in hessian function") + if not getattr(f, 'diff'): + # check differentiability + raise ValueError("Function `f` (%s) is not differentiable" % f) + m = len(constraints) + N = m + n + out = zeros(N) + for k, g in enumerate(constraints): + if not getattr(g, 'diff'): + # check differentiability + raise ValueError("Function `f` (%s) is not differentiable" % f) + for i in range(n): + out[k, i + m] = g.diff(varlist[i]) + for i in range(n): + for j in range(i, n): + out[i + m, j + m] = f.diff(varlist[i]).diff(varlist[j]) + for i in range(N): + for j in range(i + 1, N): + out[j, i] = out[i, j] + return out + + +def jordan_cell(eigenval, n): + """ + Create a Jordan block: + + Examples + ======== + + >>> from sympy import jordan_cell + >>> from sympy.abc import x + >>> jordan_cell(x, 4) + Matrix([ + [x, 1, 0, 0], + [0, x, 1, 0], + [0, 0, x, 1], + [0, 0, 0, x]]) + """ + + return Matrix.jordan_block(size=n, eigenvalue=eigenval) + + +def matrix_multiply_elementwise(A, B): + """Return the Hadamard product (elementwise product) of A and B + + >>> from sympy import Matrix, matrix_multiply_elementwise + >>> A = Matrix([[0, 1, 2], [3, 4, 5]]) + >>> B = Matrix([[1, 10, 100], [100, 10, 1]]) + >>> matrix_multiply_elementwise(A, B) + Matrix([ + [ 0, 10, 200], + [300, 40, 5]]) + + See Also + ======== + + sympy.matrices.matrixbase.MatrixBase.__mul__ + """ + return A.multiply_elementwise(B) + + +def ones(*args, **kwargs): + """Returns a matrix of ones with ``rows`` rows and ``cols`` columns; + if ``cols`` is omitted a square matrix will be returned. + + See Also + ======== + + zeros + eye + diag + """ + + if 'c' in kwargs: + kwargs['cols'] = kwargs.pop('c') + + return Matrix.ones(*args, **kwargs) + + +def randMatrix(r, c=None, min=0, max=99, seed=None, symmetric=False, + percent=100, prng=None): + """Create random matrix with dimensions ``r`` x ``c``. If ``c`` is omitted + the matrix will be square. If ``symmetric`` is True the matrix must be + square. If ``percent`` is less than 100 then only approximately the given + percentage of elements will be non-zero. + + The pseudo-random number generator used to generate matrix is chosen in the + following way. + + * If ``prng`` is supplied, it will be used as random number generator. + It should be an instance of ``random.Random``, or at least have + ``randint`` and ``shuffle`` methods with same signatures. + * if ``prng`` is not supplied but ``seed`` is supplied, then new + ``random.Random`` with given ``seed`` will be created; + * otherwise, a new ``random.Random`` with default seed will be used. + + Examples + ======== + + >>> from sympy import randMatrix + >>> randMatrix(3) # doctest:+SKIP + [25, 45, 27] + [44, 54, 9] + [23, 96, 46] + >>> randMatrix(3, 2) # doctest:+SKIP + [87, 29] + [23, 37] + [90, 26] + >>> randMatrix(3, 3, 0, 2) # doctest:+SKIP + [0, 2, 0] + [2, 0, 1] + [0, 0, 1] + >>> randMatrix(3, symmetric=True) # doctest:+SKIP + [85, 26, 29] + [26, 71, 43] + [29, 43, 57] + >>> A = randMatrix(3, seed=1) + >>> B = randMatrix(3, seed=2) + >>> A == B + False + >>> A == randMatrix(3, seed=1) + True + >>> randMatrix(3, symmetric=True, percent=50) # doctest:+SKIP + [77, 70, 0], + [70, 0, 0], + [ 0, 0, 88] + """ + # Note that ``Random()`` is equivalent to ``Random(None)`` + prng = prng or random.Random(seed) + + if c is None: + c = r + + if symmetric and r != c: + raise ValueError('For symmetric matrices, r must equal c, but %i != %i' % (r, c)) + + ij = range(r * c) + if percent != 100: + ij = prng.sample(ij, int(len(ij)*percent // 100)) + + m = zeros(r, c) + + if not symmetric: + for ijk in ij: + i, j = divmod(ijk, c) + m[i, j] = prng.randint(min, max) + else: + for ijk in ij: + i, j = divmod(ijk, c) + if i <= j: + m[i, j] = m[j, i] = prng.randint(min, max) + + return m + + +def wronskian(functions, var, method='bareiss'): + """ + Compute Wronskian for [] of functions + + :: + + | f1 f2 ... fn | + | f1' f2' ... fn' | + | . . . . | + W(f1, ..., fn) = | . . . . | + | . . . . | + | (n) (n) (n) | + | D (f1) D (f2) ... D (fn) | + + see: https://en.wikipedia.org/wiki/Wronskian + + See Also + ======== + + sympy.matrices.matrixbase.MatrixBase.jacobian + hessian + """ + + functions = [sympify(f) for f in functions] + n = len(functions) + if n == 0: + return S.One + W = Matrix(n, n, lambda i, j: functions[i].diff(var, j)) + return W.det(method) + + +def zeros(*args, **kwargs): + """Returns a matrix of zeros with ``rows`` rows and ``cols`` columns; + if ``cols`` is omitted a square matrix will be returned. + + See Also + ======== + + ones + eye + diag + """ + + if 'c' in kwargs: + kwargs['cols'] = kwargs.pop('c') + + return Matrix.zeros(*args, **kwargs) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/eigen.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/eigen.py new file mode 100644 index 0000000000000000000000000000000000000000..87b2418efcece1c0b158ec56995bb011286feb3c --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/eigen.py @@ -0,0 +1,1346 @@ +from types import FunctionType +from collections import Counter + +from mpmath import mp, workprec +from mpmath.libmp.libmpf import prec_to_dps + +from sympy.core.sorting import default_sort_key +from sympy.core.evalf import DEFAULT_MAXPREC, PrecisionExhausted +from sympy.core.logic import fuzzy_and, fuzzy_or +from sympy.core.numbers import Float +from sympy.core.sympify import _sympify +from sympy.functions.elementary.miscellaneous import sqrt +from sympy.polys import roots, CRootOf, ZZ, QQ, EX +from sympy.polys.matrices import DomainMatrix +from sympy.polys.matrices.eigen import dom_eigenvects, dom_eigenvects_to_sympy +from sympy.polys.polytools import gcd + +from .exceptions import MatrixError, NonSquareMatrixError +from .determinant import _find_reasonable_pivot + +from .utilities import _iszero, _simplify + + +__doctest_requires__ = { + ('_is_indefinite', + '_is_negative_definite', + '_is_negative_semidefinite', + '_is_positive_definite', + '_is_positive_semidefinite'): ['matplotlib'], +} + + +def _eigenvals_eigenvects_mpmath(M): + norm2 = lambda v: mp.sqrt(sum(i**2 for i in v)) + + v1 = None + prec = max(x._prec for x in M.atoms(Float)) + eps = 2**-prec + + while prec < DEFAULT_MAXPREC: + with workprec(prec): + A = mp.matrix(M.evalf(n=prec_to_dps(prec))) + E, ER = mp.eig(A) + v2 = norm2([i for e in E for i in (mp.re(e), mp.im(e))]) + if v1 is not None and mp.fabs(v1 - v2) < eps: + return E, ER + v1 = v2 + prec *= 2 + + # we get here because the next step would have taken us + # past MAXPREC or because we never took a step; in case + # of the latter, we refuse to send back a solution since + # it would not have been verified; we also resist taking + # a small step to arrive exactly at MAXPREC since then + # the two calculations might be artificially close. + raise PrecisionExhausted + + +def _eigenvals_mpmath(M, multiple=False): + """Compute eigenvalues using mpmath""" + E, _ = _eigenvals_eigenvects_mpmath(M) + result = [_sympify(x) for x in E] + if multiple: + return result + return dict(Counter(result)) + + +def _eigenvects_mpmath(M): + E, ER = _eigenvals_eigenvects_mpmath(M) + result = [] + for i in range(M.rows): + eigenval = _sympify(E[i]) + eigenvect = _sympify(ER[:, i]) + result.append((eigenval, 1, [eigenvect])) + + return result + + +# This function is a candidate for caching if it gets implemented for matrices. +def _eigenvals( + M, error_when_incomplete=True, *, simplify=False, multiple=False, + rational=False, **flags): + r"""Compute eigenvalues of the matrix. + + Parameters + ========== + + error_when_incomplete : bool, optional + If it is set to ``True``, it will raise an error if not all + eigenvalues are computed. This is caused by ``roots`` not returning + a full list of eigenvalues. + + simplify : bool or function, optional + If it is set to ``True``, it attempts to return the most + simplified form of expressions returned by applying default + simplification method in every routine. + + If it is set to ``False``, it will skip simplification in this + particular routine to save computation resources. + + If a function is passed to, it will attempt to apply + the particular function as simplification method. + + rational : bool, optional + If it is set to ``True``, every floating point numbers would be + replaced with rationals before computation. It can solve some + issues of ``roots`` routine not working well with floats. + + multiple : bool, optional + If it is set to ``True``, the result will be in the form of a + list. + + If it is set to ``False``, the result will be in the form of a + dictionary. + + Returns + ======= + + eigs : list or dict + Eigenvalues of a matrix. The return format would be specified by + the key ``multiple``. + + Raises + ====== + + MatrixError + If not enough roots had got computed. + + NonSquareMatrixError + If attempted to compute eigenvalues from a non-square matrix. + + Examples + ======== + + >>> from sympy import Matrix + >>> M = Matrix(3, 3, [0, 1, 1, 1, 0, 0, 1, 1, 1]) + >>> M.eigenvals() + {-1: 1, 0: 1, 2: 1} + + See Also + ======== + + MatrixBase.charpoly + eigenvects + + Notes + ===== + + Eigenvalues of a matrix $A$ can be computed by solving a matrix + equation $\det(A - \lambda I) = 0$ + + It's not always possible to return radical solutions for + eigenvalues for matrices larger than $4, 4$ shape due to + Abel-Ruffini theorem. + + If there is no radical solution is found for the eigenvalue, + it may return eigenvalues in the form of + :class:`sympy.polys.rootoftools.ComplexRootOf`. + """ + if not M: + if multiple: + return [] + return {} + + if not M.is_square: + raise NonSquareMatrixError("{} must be a square matrix.".format(M)) + + if M._rep.domain not in (ZZ, QQ): + # Skip this check for ZZ/QQ because it can be slow + if all(x.is_number for x in M) and M.has(Float): + return _eigenvals_mpmath(M, multiple=multiple) + + if rational: + from sympy.simplify import nsimplify + M = M.applyfunc( + lambda x: nsimplify(x, rational=True) if x.has(Float) else x) + + if multiple: + return _eigenvals_list( + M, error_when_incomplete=error_when_incomplete, simplify=simplify, + **flags) + return _eigenvals_dict( + M, error_when_incomplete=error_when_incomplete, simplify=simplify, + **flags) + + +eigenvals_error_message = \ +"It is not always possible to express the eigenvalues of a matrix " + \ +"of size 5x5 or higher in radicals. " + \ +"We have CRootOf, but domains other than the rationals are not " + \ +"currently supported. " + \ +"If there are no symbols in the matrix, " + \ +"it should still be possible to compute numeric approximations " + \ +"of the eigenvalues using " + \ +"M.evalf().eigenvals() or M.charpoly().nroots()." + + +def _eigenvals_list( + M, error_when_incomplete=True, simplify=False, **flags): + iblocks = M.strongly_connected_components() + all_eigs = [] + is_dom = M._rep.domain in (ZZ, QQ) + for b in iblocks: + + # Fast path for a 1x1 block: + if is_dom and len(b) == 1: + index = b[0] + val = M[index, index] + all_eigs.append(val) + continue + + block = M[b, b] + + if isinstance(simplify, FunctionType): + charpoly = block.charpoly(simplify=simplify) + else: + charpoly = block.charpoly() + + eigs = roots(charpoly, multiple=True, **flags) + + if len(eigs) != block.rows: + try: + eigs = charpoly.all_roots(multiple=True) + except NotImplementedError: + if error_when_incomplete: + raise MatrixError(eigenvals_error_message) + else: + eigs = [] + + all_eigs += eigs + + if not simplify: + return all_eigs + if not isinstance(simplify, FunctionType): + simplify = _simplify + return [simplify(value) for value in all_eigs] + + +def _eigenvals_dict( + M, error_when_incomplete=True, simplify=False, **flags): + iblocks = M.strongly_connected_components() + all_eigs = {} + is_dom = M._rep.domain in (ZZ, QQ) + for b in iblocks: + + # Fast path for a 1x1 block: + if is_dom and len(b) == 1: + index = b[0] + val = M[index, index] + all_eigs[val] = all_eigs.get(val, 0) + 1 + continue + + block = M[b, b] + + if isinstance(simplify, FunctionType): + charpoly = block.charpoly(simplify=simplify) + else: + charpoly = block.charpoly() + + eigs = roots(charpoly, multiple=False, **flags) + + if sum(eigs.values()) != block.rows: + try: + eigs = dict(charpoly.all_roots(multiple=False)) + except NotImplementedError: + if error_when_incomplete: + raise MatrixError(eigenvals_error_message) + else: + eigs = {} + + for k, v in eigs.items(): + if k in all_eigs: + all_eigs[k] += v + else: + all_eigs[k] = v + + if not simplify: + return all_eigs + if not isinstance(simplify, FunctionType): + simplify = _simplify + return {simplify(key): value for key, value in all_eigs.items()} + + +def _eigenspace(M, eigenval, iszerofunc=_iszero, simplify=False): + """Get a basis for the eigenspace for a particular eigenvalue""" + m = M - M.eye(M.rows) * eigenval + ret = m.nullspace(iszerofunc=iszerofunc) + + # The nullspace for a real eigenvalue should be non-trivial. + # If we didn't find an eigenvector, try once more a little harder + if len(ret) == 0 and simplify: + ret = m.nullspace(iszerofunc=iszerofunc, simplify=True) + if len(ret) == 0: + raise NotImplementedError( + "Can't evaluate eigenvector for eigenvalue {}".format(eigenval)) + return ret + + +def _eigenvects_DOM(M, **kwargs): + DOM = DomainMatrix.from_Matrix(M, field=True, extension=True) + DOM = DOM.to_dense() + + if DOM.domain != EX: + rational, algebraic = dom_eigenvects(DOM) + eigenvects = dom_eigenvects_to_sympy( + rational, algebraic, M.__class__, **kwargs) + eigenvects = sorted(eigenvects, key=lambda x: default_sort_key(x[0])) + + return eigenvects + return None + + +def _eigenvects_sympy(M, iszerofunc, simplify=True, **flags): + eigenvals = M.eigenvals(rational=False, **flags) + + # Make sure that we have all roots in radical form + for x in eigenvals: + if x.has(CRootOf): + raise MatrixError( + "Eigenvector computation is not implemented if the matrix have " + "eigenvalues in CRootOf form") + + eigenvals = sorted(eigenvals.items(), key=default_sort_key) + ret = [] + for val, mult in eigenvals: + vects = _eigenspace(M, val, iszerofunc=iszerofunc, simplify=simplify) + ret.append((val, mult, vects)) + return ret + + +# This functions is a candidate for caching if it gets implemented for matrices. +def _eigenvects(M, error_when_incomplete=True, iszerofunc=_iszero, *, chop=False, **flags): + """Compute eigenvectors of the matrix. + + Parameters + ========== + + error_when_incomplete : bool, optional + Raise an error when not all eigenvalues are computed. This is + caused by ``roots`` not returning a full list of eigenvalues. + + iszerofunc : function, optional + Specifies a zero testing function to be used in ``rref``. + + Default value is ``_iszero``, which uses SymPy's naive and fast + default assumption handler. + + It can also accept any user-specified zero testing function, if it + is formatted as a function which accepts a single symbolic argument + and returns ``True`` if it is tested as zero and ``False`` if it + is tested as non-zero, and ``None`` if it is undecidable. + + simplify : bool or function, optional + If ``True``, ``as_content_primitive()`` will be used to tidy up + normalization artifacts. + + It will also be used by the ``nullspace`` routine. + + chop : bool or positive number, optional + If the matrix contains any Floats, they will be changed to Rationals + for computation purposes, but the answers will be returned after + being evaluated with evalf. The ``chop`` flag is passed to ``evalf``. + When ``chop=True`` a default precision will be used; a number will + be interpreted as the desired level of precision. + + Returns + ======= + + ret : [(eigenval, multiplicity, eigenspace), ...] + A ragged list containing tuples of data obtained by ``eigenvals`` + and ``nullspace``. + + ``eigenspace`` is a list containing the ``eigenvector`` for each + eigenvalue. + + ``eigenvector`` is a vector in the form of a ``Matrix``. e.g. + a vector of length 3 is returned as ``Matrix([a_1, a_2, a_3])``. + + Raises + ====== + + NotImplementedError + If failed to compute nullspace. + + Examples + ======== + + >>> from sympy import Matrix + >>> M = Matrix(3, 3, [0, 1, 1, 1, 0, 0, 1, 1, 1]) + >>> M.eigenvects() + [(-1, 1, [Matrix([ + [-1], + [ 1], + [ 0]])]), (0, 1, [Matrix([ + [ 0], + [-1], + [ 1]])]), (2, 1, [Matrix([ + [2/3], + [1/3], + [ 1]])])] + + See Also + ======== + + eigenvals + MatrixBase.nullspace + """ + simplify = flags.get('simplify', True) + primitive = flags.get('simplify', False) + flags.pop('simplify', None) # remove this if it's there + flags.pop('multiple', None) # remove this if it's there + + if not isinstance(simplify, FunctionType): + simpfunc = _simplify if simplify else lambda x: x + + has_floats = M.has(Float) + if has_floats: + if all(x.is_number for x in M): + return _eigenvects_mpmath(M) + from sympy.simplify import nsimplify + M = M.applyfunc(lambda x: nsimplify(x, rational=True)) + + ret = _eigenvects_DOM(M) + if ret is None: + ret = _eigenvects_sympy(M, iszerofunc, simplify=simplify, **flags) + + if primitive: + # if the primitive flag is set, get rid of any common + # integer denominators + def denom_clean(l): + return [(v / gcd(list(v))).applyfunc(simpfunc) for v in l] + + ret = [(val, mult, denom_clean(es)) for val, mult, es in ret] + + if has_floats: + # if we had floats to start with, turn the eigenvectors to floats + ret = [(val.evalf(chop=chop), mult, [v.evalf(chop=chop) for v in es]) + for val, mult, es in ret] + + return ret + + +def _is_diagonalizable_with_eigen(M, reals_only=False): + """See _is_diagonalizable. This function returns the bool along with the + eigenvectors to avoid calculating them again in functions like + ``diagonalize``.""" + + if not M.is_square: + return False, [] + + eigenvecs = M.eigenvects(simplify=True) + + for val, mult, basis in eigenvecs: + if reals_only and not val.is_real: # if we have a complex eigenvalue + return False, eigenvecs + + if mult != len(basis): # if the geometric multiplicity doesn't equal the algebraic + return False, eigenvecs + + return True, eigenvecs + +def _is_diagonalizable(M, reals_only=False, **kwargs): + """Returns ``True`` if a matrix is diagonalizable. + + Parameters + ========== + + reals_only : bool, optional + If ``True``, it tests whether the matrix can be diagonalized + to contain only real numbers on the diagonal. + + + If ``False``, it tests whether the matrix can be diagonalized + at all, even with numbers that may not be real. + + Examples + ======== + + Example of a diagonalizable matrix: + + >>> from sympy import Matrix + >>> M = Matrix([[1, 2, 0], [0, 3, 0], [2, -4, 2]]) + >>> M.is_diagonalizable() + True + + Example of a non-diagonalizable matrix: + + >>> M = Matrix([[0, 1], [0, 0]]) + >>> M.is_diagonalizable() + False + + Example of a matrix that is diagonalized in terms of non-real entries: + + >>> M = Matrix([[0, 1], [-1, 0]]) + >>> M.is_diagonalizable(reals_only=False) + True + >>> M.is_diagonalizable(reals_only=True) + False + + See Also + ======== + + sympy.matrices.matrixbase.MatrixBase.is_diagonal + diagonalize + """ + if not M.is_square: + return False + + if all(e.is_real for e in M) and M.is_symmetric(): + return True + + if all(e.is_complex for e in M) and M.is_hermitian: + return True + + return _is_diagonalizable_with_eigen(M, reals_only=reals_only)[0] + + +#G&VL, Matrix Computations, Algo 5.4.2 +def _householder_vector(x): + if not x.cols == 1: + raise ValueError("Input must be a column matrix") + v = x.copy() + v_plus = x.copy() + v_minus = x.copy() + q = x[0, 0] / abs(x[0, 0]) + norm_x = x.norm() + v_plus[0, 0] = x[0, 0] + q * norm_x + v_minus[0, 0] = x[0, 0] - q * norm_x + if x[1:, 0].norm() == 0: + bet = 0 + v[0, 0] = 1 + else: + if v_plus.norm() <= v_minus.norm(): + v = v_plus + else: + v = v_minus + v = v / v[0] + bet = 2 / (v.norm() ** 2) + return v, bet + + +def _bidiagonal_decmp_hholder(M): + m = M.rows + n = M.cols + A = M.as_mutable() + U, V = A.eye(m), A.eye(n) + for i in range(min(m, n)): + v, bet = _householder_vector(A[i:, i]) + hh_mat = A.eye(m - i) - bet * v * v.H + A[i:, i:] = hh_mat * A[i:, i:] + temp = A.eye(m) + temp[i:, i:] = hh_mat + U = U * temp + if i + 1 <= n - 2: + v, bet = _householder_vector(A[i, i+1:].T) + hh_mat = A.eye(n - i - 1) - bet * v * v.H + A[i:, i+1:] = A[i:, i+1:] * hh_mat + temp = A.eye(n) + temp[i+1:, i+1:] = hh_mat + V = temp * V + return U, A, V + + +def _eval_bidiag_hholder(M): + m = M.rows + n = M.cols + A = M.as_mutable() + for i in range(min(m, n)): + v, bet = _householder_vector(A[i:, i]) + hh_mat = A.eye(m-i) - bet * v * v.H + A[i:, i:] = hh_mat * A[i:, i:] + if i + 1 <= n - 2: + v, bet = _householder_vector(A[i, i+1:].T) + hh_mat = A.eye(n - i - 1) - bet * v * v.H + A[i:, i+1:] = A[i:, i+1:] * hh_mat + return A + + +def _bidiagonal_decomposition(M, upper=True): + """ + Returns $(U,B,V.H)$ for + + $$A = UBV^{H}$$ + + where $A$ is the input matrix, and $B$ is its Bidiagonalized form + + Note: Bidiagonal Computation can hang for symbolic matrices. + + Parameters + ========== + + upper : bool. Whether to do upper bidiagnalization or lower. + True for upper and False for lower. + + References + ========== + + .. [1] Algorithm 5.4.2, Matrix computations by Golub and Van Loan, 4th edition + .. [2] Complex Matrix Bidiagonalization, https://github.com/vslobody/Householder-Bidiagonalization + + """ + + if not isinstance(upper, bool): + raise ValueError("upper must be a boolean") + + if upper: + return _bidiagonal_decmp_hholder(M) + + X = _bidiagonal_decmp_hholder(M.H) + return X[2].H, X[1].H, X[0].H + + +def _bidiagonalize(M, upper=True): + """ + Returns $B$, the Bidiagonalized form of the input matrix. + + Note: Bidiagonal Computation can hang for symbolic matrices. + + Parameters + ========== + + upper : bool. Whether to do upper bidiagnalization or lower. + True for upper and False for lower. + + References + ========== + + .. [1] Algorithm 5.4.2, Matrix computations by Golub and Van Loan, 4th edition + .. [2] Complex Matrix Bidiagonalization : https://github.com/vslobody/Householder-Bidiagonalization + + """ + + if not isinstance(upper, bool): + raise ValueError("upper must be a boolean") + + if upper: + return _eval_bidiag_hholder(M) + return _eval_bidiag_hholder(M.H).H + + +def _diagonalize(M, reals_only=False, sort=False, normalize=False): + """ + Return (P, D), where D is diagonal and + + D = P^-1 * M * P + + where M is current matrix. + + Parameters + ========== + + reals_only : bool. Whether to throw an error if complex numbers are need + to diagonalize. (Default: False) + + sort : bool. Sort the eigenvalues along the diagonal. (Default: False) + + normalize : bool. If True, normalize the columns of P. (Default: False) + + Examples + ======== + + >>> from sympy import Matrix + >>> M = Matrix(3, 3, [1, 2, 0, 0, 3, 0, 2, -4, 2]) + >>> M + Matrix([ + [1, 2, 0], + [0, 3, 0], + [2, -4, 2]]) + >>> (P, D) = M.diagonalize() + >>> D + Matrix([ + [1, 0, 0], + [0, 2, 0], + [0, 0, 3]]) + >>> P + Matrix([ + [-1, 0, -1], + [ 0, 0, -1], + [ 2, 1, 2]]) + >>> P.inv() * M * P + Matrix([ + [1, 0, 0], + [0, 2, 0], + [0, 0, 3]]) + + See Also + ======== + + sympy.matrices.matrixbase.MatrixBase.is_diagonal + is_diagonalizable + """ + + if not M.is_square: + raise NonSquareMatrixError() + + is_diagonalizable, eigenvecs = _is_diagonalizable_with_eigen(M, + reals_only=reals_only) + + if not is_diagonalizable: + raise MatrixError("Matrix is not diagonalizable") + + if sort: + eigenvecs = sorted(eigenvecs, key=default_sort_key) + + p_cols, diag = [], [] + + for val, mult, basis in eigenvecs: + diag += [val] * mult + p_cols += basis + + if normalize: + p_cols = [v / v.norm() for v in p_cols] + + return M.hstack(*p_cols), M.diag(*diag) + + +def _fuzzy_positive_definite(M): + positive_diagonals = M._has_positive_diagonals() + if positive_diagonals is False: + return False + + if positive_diagonals and M.is_strongly_diagonally_dominant: + return True + + return None + + +def _fuzzy_positive_semidefinite(M): + nonnegative_diagonals = M._has_nonnegative_diagonals() + if nonnegative_diagonals is False: + return False + + if nonnegative_diagonals and M.is_weakly_diagonally_dominant: + return True + + return None + + +def _is_positive_definite(M): + if not M.is_hermitian: + if not M.is_square: + return False + M = M + M.H + + fuzzy = _fuzzy_positive_definite(M) + if fuzzy is not None: + return fuzzy + + return _is_positive_definite_GE(M) + + +def _is_positive_semidefinite(M): + if not M.is_hermitian: + if not M.is_square: + return False + M = M + M.H + + fuzzy = _fuzzy_positive_semidefinite(M) + if fuzzy is not None: + return fuzzy + + return _is_positive_semidefinite_cholesky(M) + + +def _is_negative_definite(M): + return _is_positive_definite(-M) + + +def _is_negative_semidefinite(M): + return _is_positive_semidefinite(-M) + + +def _is_indefinite(M): + if M.is_hermitian: + eigen = M.eigenvals() + args1 = [x.is_positive for x in eigen.keys()] + any_positive = fuzzy_or(args1) + args2 = [x.is_negative for x in eigen.keys()] + any_negative = fuzzy_or(args2) + + return fuzzy_and([any_positive, any_negative]) + + elif M.is_square: + return (M + M.H).is_indefinite + + return False + + +def _is_positive_definite_GE(M): + """A division-free gaussian elimination method for testing + positive-definiteness.""" + M = M.as_mutable() + size = M.rows + + for i in range(size): + is_positive = M[i, i].is_positive + if is_positive is not True: + return is_positive + for j in range(i+1, size): + M[j, i+1:] = M[i, i] * M[j, i+1:] - M[j, i] * M[i, i+1:] + return True + + +def _is_positive_semidefinite_cholesky(M): + """Uses Cholesky factorization with complete pivoting + + References + ========== + + .. [1] http://eprints.ma.man.ac.uk/1199/1/covered/MIMS_ep2008_116.pdf + + .. [2] https://www.value-at-risk.net/cholesky-factorization/ + """ + M = M.as_mutable() + for k in range(M.rows): + diags = [M[i, i] for i in range(k, M.rows)] + pivot, pivot_val, nonzero, _ = _find_reasonable_pivot(diags) + + if nonzero: + return None + + if pivot is None: + for i in range(k+1, M.rows): + for j in range(k, M.cols): + iszero = M[i, j].is_zero + if iszero is None: + return None + elif iszero is False: + return False + return True + + if M[k, k].is_negative or pivot_val.is_negative: + return False + elif not (M[k, k].is_nonnegative and pivot_val.is_nonnegative): + return None + + if pivot > 0: + M.col_swap(k, k+pivot) + M.row_swap(k, k+pivot) + + M[k, k] = sqrt(M[k, k]) + M[k, k+1:] /= M[k, k] + M[k+1:, k+1:] -= M[k, k+1:].H * M[k, k+1:] + + return M[-1, -1].is_nonnegative + + +_doc_positive_definite = \ + r"""Finds out the definiteness of a matrix. + + Explanation + =========== + + A square real matrix $A$ is: + + - A positive definite matrix if $x^T A x > 0$ + for all non-zero real vectors $x$. + - A positive semidefinite matrix if $x^T A x \geq 0$ + for all non-zero real vectors $x$. + - A negative definite matrix if $x^T A x < 0$ + for all non-zero real vectors $x$. + - A negative semidefinite matrix if $x^T A x \leq 0$ + for all non-zero real vectors $x$. + - An indefinite matrix if there exists non-zero real vectors + $x, y$ with $x^T A x > 0 > y^T A y$. + + A square complex matrix $A$ is: + + - A positive definite matrix if $\text{re}(x^H A x) > 0$ + for all non-zero complex vectors $x$. + - A positive semidefinite matrix if $\text{re}(x^H A x) \geq 0$ + for all non-zero complex vectors $x$. + - A negative definite matrix if $\text{re}(x^H A x) < 0$ + for all non-zero complex vectors $x$. + - A negative semidefinite matrix if $\text{re}(x^H A x) \leq 0$ + for all non-zero complex vectors $x$. + - An indefinite matrix if there exists non-zero complex vectors + $x, y$ with $\text{re}(x^H A x) > 0 > \text{re}(y^H A y)$. + + A matrix need not be symmetric or hermitian to be positive definite. + + - A real non-symmetric matrix is positive definite if and only if + $\frac{A + A^T}{2}$ is positive definite. + - A complex non-hermitian matrix is positive definite if and only if + $\frac{A + A^H}{2}$ is positive definite. + + And this extension can apply for all the definitions above. + + However, for complex cases, you can restrict the definition of + $\text{re}(x^H A x) > 0$ to $x^H A x > 0$ and require the matrix + to be hermitian. + But we do not present this restriction for computation because you + can check ``M.is_hermitian`` independently with this and use + the same procedure. + + Examples + ======== + + An example of symmetric positive definite matrix: + + .. plot:: + :context: reset + :format: doctest + :include-source: True + + >>> from sympy import Matrix, symbols + >>> from sympy.plotting import plot3d + >>> a, b = symbols('a b') + >>> x = Matrix([a, b]) + + >>> A = Matrix([[1, 0], [0, 1]]) + >>> A.is_positive_definite + True + >>> A.is_positive_semidefinite + True + + >>> p = plot3d((x.T*A*x)[0, 0], (a, -1, 1), (b, -1, 1)) + + An example of symmetric positive semidefinite matrix: + + .. plot:: + :context: close-figs + :format: doctest + :include-source: True + + >>> A = Matrix([[1, -1], [-1, 1]]) + >>> A.is_positive_definite + False + >>> A.is_positive_semidefinite + True + + >>> p = plot3d((x.T*A*x)[0, 0], (a, -1, 1), (b, -1, 1)) + + An example of symmetric negative definite matrix: + + .. plot:: + :context: close-figs + :format: doctest + :include-source: True + + >>> A = Matrix([[-1, 0], [0, -1]]) + >>> A.is_negative_definite + True + >>> A.is_negative_semidefinite + True + >>> A.is_indefinite + False + + >>> p = plot3d((x.T*A*x)[0, 0], (a, -1, 1), (b, -1, 1)) + + An example of symmetric indefinite matrix: + + .. plot:: + :context: close-figs + :format: doctest + :include-source: True + + >>> A = Matrix([[1, 2], [2, -1]]) + >>> A.is_indefinite + True + + >>> p = plot3d((x.T*A*x)[0, 0], (a, -1, 1), (b, -1, 1)) + + An example of non-symmetric positive definite matrix. + + .. plot:: + :context: close-figs + :format: doctest + :include-source: True + + >>> A = Matrix([[1, 2], [-2, 1]]) + >>> A.is_positive_definite + True + >>> A.is_positive_semidefinite + True + + >>> p = plot3d((x.T*A*x)[0, 0], (a, -1, 1), (b, -1, 1)) + + Notes + ===== + + Although some people trivialize the definition of positive definite + matrices only for symmetric or hermitian matrices, this restriction + is not correct because it does not classify all instances of + positive definite matrices from the definition $x^T A x > 0$ or + $\text{re}(x^H A x) > 0$. + + For instance, ``Matrix([[1, 2], [-2, 1]])`` presented in + the example above is an example of real positive definite matrix + that is not symmetric. + + However, since the following formula holds true; + + .. math:: + \text{re}(x^H A x) > 0 \iff + \text{re}(x^H \frac{A + A^H}{2} x) > 0 + + We can classify all positive definite matrices that may or may not + be symmetric or hermitian by transforming the matrix to + $\frac{A + A^T}{2}$ or $\frac{A + A^H}{2}$ + (which is guaranteed to be always real symmetric or complex + hermitian) and we can defer most of the studies to symmetric or + hermitian positive definite matrices. + + But it is a different problem for the existence of Cholesky + decomposition. Because even though a non symmetric or a non + hermitian matrix can be positive definite, Cholesky or LDL + decomposition does not exist because the decompositions require the + matrix to be symmetric or hermitian. + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Definiteness_of_a_matrix#Eigenvalues + + .. [2] https://mathworld.wolfram.com/PositiveDefiniteMatrix.html + + .. [3] Johnson, C. R. "Positive Definite Matrices." Amer. + Math. Monthly 77, 259-264 1970. + """ + +_is_positive_definite.__doc__ = _doc_positive_definite +_is_positive_semidefinite.__doc__ = _doc_positive_definite +_is_negative_definite.__doc__ = _doc_positive_definite +_is_negative_semidefinite.__doc__ = _doc_positive_definite +_is_indefinite.__doc__ = _doc_positive_definite + + +def _jordan_form(M, calc_transform=True, *, chop=False): + """Return $(P, J)$ where $J$ is a Jordan block + matrix and $P$ is a matrix such that $M = P J P^{-1}$ + + Parameters + ========== + + calc_transform : bool + If ``False``, then only $J$ is returned. + + chop : bool + All matrices are converted to exact types when computing + eigenvalues and eigenvectors. As a result, there may be + approximation errors. If ``chop==True``, these errors + will be truncated. + + Examples + ======== + + >>> from sympy import Matrix + >>> M = Matrix([[ 6, 5, -2, -3], [-3, -1, 3, 3], [ 2, 1, -2, -3], [-1, 1, 5, 5]]) + >>> P, J = M.jordan_form() + >>> J + Matrix([ + [2, 1, 0, 0], + [0, 2, 0, 0], + [0, 0, 2, 1], + [0, 0, 0, 2]]) + + See Also + ======== + + jordan_block + """ + + if not M.is_square: + raise NonSquareMatrixError("Only square matrices have Jordan forms") + + mat = M + has_floats = M.has(Float) + + if has_floats: + try: + max_prec = max(term._prec for term in M.values() if isinstance(term, Float)) + except ValueError: + # if no term in the matrix is explicitly a Float calling max() + # will throw a error so setting max_prec to default value of 53 + max_prec = 53 + + # setting minimum max_dps to 15 to prevent loss of precision in + # matrix containing non evaluated expressions + max_dps = max(prec_to_dps(max_prec), 15) + + def restore_floats(*args): + """If ``has_floats`` is `True`, cast all ``args`` as + matrices of floats.""" + + if has_floats: + args = [m.evalf(n=max_dps, chop=chop) for m in args] + if len(args) == 1: + return args[0] + + return args + + # cache calculations for some speedup + mat_cache = {} + + def eig_mat(val, pow): + """Cache computations of ``(M - val*I)**pow`` for quick + retrieval""" + + if (val, pow) in mat_cache: + return mat_cache[(val, pow)] + + if (val, pow - 1) in mat_cache: + mat_cache[(val, pow)] = mat_cache[(val, pow - 1)].multiply( + mat_cache[(val, 1)], dotprodsimp=None) + else: + mat_cache[(val, pow)] = (mat - val*M.eye(M.rows)).pow(pow) + + return mat_cache[(val, pow)] + + # helper functions + def nullity_chain(val, algebraic_multiplicity): + """Calculate the sequence [0, nullity(E), nullity(E**2), ...] + until it is constant where ``E = M - val*I``""" + + # mat.rank() is faster than computing the null space, + # so use the rank-nullity theorem + cols = M.cols + ret = [0] + nullity = cols - eig_mat(val, 1).rank() + i = 2 + + while nullity != ret[-1]: + ret.append(nullity) + + if nullity == algebraic_multiplicity: + break + + nullity = cols - eig_mat(val, i).rank() + i += 1 + + # Due to issues like #7146 and #15872, SymPy sometimes + # gives the wrong rank. In this case, raise an error + # instead of returning an incorrect matrix + if nullity < ret[-1] or nullity > algebraic_multiplicity: + raise MatrixError( + "SymPy had encountered an inconsistent " + "result while computing Jordan block: " + "{}".format(M)) + + return ret + + def blocks_from_nullity_chain(d): + """Return a list of the size of each Jordan block. + If d_n is the nullity of E**n, then the number + of Jordan blocks of size n is + + 2*d_n - d_(n-1) - d_(n+1)""" + + # d[0] is always the number of columns, so skip past it + mid = [2*d[n] - d[n - 1] - d[n + 1] for n in range(1, len(d) - 1)] + # d is assumed to plateau with "d[ len(d) ] == d[-1]", so + # 2*d_n - d_(n-1) - d_(n+1) == d_n - d_(n-1) + end = [d[-1] - d[-2]] if len(d) > 1 else [d[0]] + + return mid + end + + def pick_vec(small_basis, big_basis): + """Picks a vector from big_basis that isn't in + the subspace spanned by small_basis""" + + if len(small_basis) == 0: + return big_basis[0] + + for v in big_basis: + _, pivots = M.hstack(*(small_basis + [v])).echelon_form( + with_pivots=True) + + if pivots[-1] == len(small_basis): + return v + + # roots doesn't like Floats, so replace them with Rationals + if has_floats: + from sympy.simplify import nsimplify + mat = mat.applyfunc(lambda x: nsimplify(x, rational=True)) + + # first calculate the jordan block structure + eigs = mat.eigenvals() + + # Make sure that we have all roots in radical form + for x in eigs: + if x.has(CRootOf): + raise MatrixError( + "Jordan normal form is not implemented if the matrix have " + "eigenvalues in CRootOf form") + + # most matrices have distinct eigenvalues + # and so are diagonalizable. In this case, don't + # do extra work! + if len(eigs.keys()) == mat.cols: + blocks = sorted(eigs.keys(), key=default_sort_key) + jordan_mat = mat.diag(*blocks) + + if not calc_transform: + return restore_floats(jordan_mat) + + jordan_basis = [eig_mat(eig, 1).nullspace()[0] + for eig in blocks] + basis_mat = mat.hstack(*jordan_basis) + + return restore_floats(basis_mat, jordan_mat) + + block_structure = [] + + for eig in sorted(eigs.keys(), key=default_sort_key): + algebraic_multiplicity = eigs[eig] + chain = nullity_chain(eig, algebraic_multiplicity) + block_sizes = blocks_from_nullity_chain(chain) + + # if block_sizes = = [a, b, c, ...], then the number of + # Jordan blocks of size 1 is a, of size 2 is b, etc. + # create an array that has (eig, block_size) with one + # entry for each block + size_nums = [(i+1, num) for i, num in enumerate(block_sizes)] + + # we expect larger Jordan blocks to come earlier + size_nums.reverse() + + block_structure.extend( + [(eig, size) for size, num in size_nums for _ in range(num)]) + + jordan_form_size = sum(size for eig, size in block_structure) + + if jordan_form_size != M.rows: + raise MatrixError( + "SymPy had encountered an inconsistent result while " + "computing Jordan block. : {}".format(M)) + + blocks = (mat.jordan_block(size=size, eigenvalue=eig) for eig, size in block_structure) + jordan_mat = mat.diag(*blocks) + + if not calc_transform: + return restore_floats(jordan_mat) + + # For each generalized eigenspace, calculate a basis. + # We start by looking for a vector in null( (A - eig*I)**n ) + # which isn't in null( (A - eig*I)**(n-1) ) where n is + # the size of the Jordan block + # + # Ideally we'd just loop through block_structure and + # compute each generalized eigenspace. However, this + # causes a lot of unneeded computation. Instead, we + # go through the eigenvalues separately, since we know + # their generalized eigenspaces must have bases that + # are linearly independent. + jordan_basis = [] + + for eig in sorted(eigs.keys(), key=default_sort_key): + eig_basis = [] + + for block_eig, size in block_structure: + if block_eig != eig: + continue + + null_big = (eig_mat(eig, size)).nullspace() + null_small = (eig_mat(eig, size - 1)).nullspace() + + # we want to pick something that is in the big basis + # and not the small, but also something that is independent + # of any other generalized eigenvectors from a different + # generalized eigenspace sharing the same eigenvalue. + vec = pick_vec(null_small + eig_basis, null_big) + new_vecs = [eig_mat(eig, i).multiply(vec, dotprodsimp=None) + for i in range(size)] + + eig_basis.extend(new_vecs) + jordan_basis.extend(reversed(new_vecs)) + + basis_mat = mat.hstack(*jordan_basis) + + return restore_floats(basis_mat, jordan_mat) + + +def _left_eigenvects(M, **flags): + """Returns left eigenvectors and eigenvalues. + + This function returns the list of triples (eigenval, multiplicity, + basis) for the left eigenvectors. Options are the same as for + eigenvects(), i.e. the ``**flags`` arguments gets passed directly to + eigenvects(). + + Examples + ======== + + >>> from sympy import Matrix + >>> M = Matrix([[0, 1, 1], [1, 0, 0], [1, 1, 1]]) + >>> M.eigenvects() + [(-1, 1, [Matrix([ + [-1], + [ 1], + [ 0]])]), (0, 1, [Matrix([ + [ 0], + [-1], + [ 1]])]), (2, 1, [Matrix([ + [2/3], + [1/3], + [ 1]])])] + >>> M.left_eigenvects() + [(-1, 1, [Matrix([[-2, 1, 1]])]), (0, 1, [Matrix([[-1, -1, 1]])]), (2, + 1, [Matrix([[1, 1, 1]])])] + + """ + + eigs = M.transpose().eigenvects(**flags) + + return [(val, mult, [l.transpose() for l in basis]) for val, mult, basis in eigs] + + +def _singular_values(M): + """Compute the singular values of a Matrix + + Examples + ======== + + >>> from sympy import Matrix, Symbol + >>> x = Symbol('x', real=True) + >>> M = Matrix([[0, 1, 0], [0, x, 0], [-1, 0, 0]]) + >>> M.singular_values() + [sqrt(x**2 + 1), 1, 0] + + See Also + ======== + + condition_number + """ + + if M.rows >= M.cols: + valmultpairs = M.H.multiply(M).eigenvals() + else: + valmultpairs = M.multiply(M.H).eigenvals() + + # Expands result from eigenvals into a simple list + vals = [] + + for k, v in valmultpairs.items(): + vals += [sqrt(k)] * v # dangerous! same k in several spots! + + # Pad with zeros if singular values are computed in reverse way, + # to give consistent format. + if len(vals) < M.cols: + vals += [M.zero] * (M.cols - len(vals)) + + # sort them in descending order + vals.sort(reverse=True, key=default_sort_key) + + return vals diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/expressions/permutation.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/expressions/permutation.py new file mode 100644 index 0000000000000000000000000000000000000000..5634fa941a53d8890583fe61bb29bc34f4e6000d --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/expressions/permutation.py @@ -0,0 +1,303 @@ +from sympy.core import S +from sympy.core.sympify import _sympify +from sympy.functions import KroneckerDelta + +from .matexpr import MatrixExpr +from .special import ZeroMatrix, Identity, OneMatrix + + +class PermutationMatrix(MatrixExpr): + """A Permutation Matrix + + Parameters + ========== + + perm : Permutation + The permutation the matrix uses. + + The size of the permutation determines the matrix size. + + See the documentation of + :class:`sympy.combinatorics.permutations.Permutation` for + the further information of how to create a permutation object. + + Examples + ======== + + >>> from sympy import Matrix, PermutationMatrix + >>> from sympy.combinatorics import Permutation + + Creating a permutation matrix: + + >>> p = Permutation(1, 2, 0) + >>> P = PermutationMatrix(p) + >>> P = P.as_explicit() + >>> P + Matrix([ + [0, 1, 0], + [0, 0, 1], + [1, 0, 0]]) + + Permuting a matrix row and column: + + >>> M = Matrix([0, 1, 2]) + >>> Matrix(P*M) + Matrix([ + [1], + [2], + [0]]) + + >>> Matrix(M.T*P) + Matrix([[2, 0, 1]]) + + See Also + ======== + + sympy.combinatorics.permutations.Permutation + """ + + def __new__(cls, perm): + from sympy.combinatorics.permutations import Permutation + + perm = _sympify(perm) + if not isinstance(perm, Permutation): + raise ValueError( + "{} must be a SymPy Permutation instance.".format(perm)) + + return super().__new__(cls, perm) + + @property + def shape(self): + size = self.args[0].size + return (size, size) + + @property + def is_Identity(self): + return self.args[0].is_Identity + + def doit(self, **hints): + if self.is_Identity: + return Identity(self.rows) + return self + + def _entry(self, i, j, **kwargs): + perm = self.args[0] + return KroneckerDelta(perm.apply(i), j) + + def _eval_power(self, exp): + return PermutationMatrix(self.args[0] ** exp).doit() + + def _eval_inverse(self): + return PermutationMatrix(self.args[0] ** -1) + + _eval_transpose = _eval_adjoint = _eval_inverse + + def _eval_determinant(self): + sign = self.args[0].signature() + if sign == 1: + return S.One + elif sign == -1: + return S.NegativeOne + raise NotImplementedError + + def _eval_rewrite_as_BlockDiagMatrix(self, *args, **kwargs): + from sympy.combinatorics.permutations import Permutation + from .blockmatrix import BlockDiagMatrix + + perm = self.args[0] + full_cyclic_form = perm.full_cyclic_form + + cycles_picks = [] + + # Stage 1. Decompose the cycles into the blockable form. + a, b, c = 0, 0, 0 + flag = False + for cycle in full_cyclic_form: + l = len(cycle) + m = max(cycle) + + if not flag: + if m + 1 > a + l: + flag = True + temp = [cycle] + b = m + c = l + else: + cycles_picks.append([cycle]) + a += l + + else: + if m > b: + if m + 1 == a + c + l: + temp.append(cycle) + cycles_picks.append(temp) + flag = False + a = m+1 + else: + b = m + temp.append(cycle) + c += l + else: + if b + 1 == a + c + l: + temp.append(cycle) + cycles_picks.append(temp) + flag = False + a = b+1 + else: + temp.append(cycle) + c += l + + # Stage 2. Normalize each decomposed cycles and build matrix. + p = 0 + args = [] + for pick in cycles_picks: + new_cycles = [] + l = 0 + for cycle in pick: + new_cycle = [i - p for i in cycle] + new_cycles.append(new_cycle) + l += len(cycle) + p += l + perm = Permutation(new_cycles) + mat = PermutationMatrix(perm) + args.append(mat) + + return BlockDiagMatrix(*args) + + +class MatrixPermute(MatrixExpr): + r"""Symbolic representation for permuting matrix rows or columns. + + Parameters + ========== + + perm : Permutation, PermutationMatrix + The permutation to use for permuting the matrix. + The permutation can be resized to the suitable one, + + axis : 0 or 1 + The axis to permute alongside. + If `0`, it will permute the matrix rows. + If `1`, it will permute the matrix columns. + + Notes + ===== + + This follows the same notation used in + :meth:`sympy.matrices.matrixbase.MatrixBase.permute`. + + Examples + ======== + + >>> from sympy import Matrix, MatrixPermute + >>> from sympy.combinatorics import Permutation + + Permuting the matrix rows: + + >>> p = Permutation(1, 2, 0) + >>> A = Matrix([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + >>> B = MatrixPermute(A, p, axis=0) + >>> B.as_explicit() + Matrix([ + [4, 5, 6], + [7, 8, 9], + [1, 2, 3]]) + + Permuting the matrix columns: + + >>> B = MatrixPermute(A, p, axis=1) + >>> B.as_explicit() + Matrix([ + [2, 3, 1], + [5, 6, 4], + [8, 9, 7]]) + + See Also + ======== + + sympy.matrices.matrixbase.MatrixBase.permute + """ + def __new__(cls, mat, perm, axis=S.Zero): + from sympy.combinatorics.permutations import Permutation + + mat = _sympify(mat) + if not mat.is_Matrix: + raise ValueError( + "{} must be a SymPy matrix instance.".format(perm)) + + perm = _sympify(perm) + if isinstance(perm, PermutationMatrix): + perm = perm.args[0] + + if not isinstance(perm, Permutation): + raise ValueError( + "{} must be a SymPy Permutation or a PermutationMatrix " \ + "instance".format(perm)) + + axis = _sympify(axis) + if axis not in (0, 1): + raise ValueError("The axis must be 0 or 1.") + + mat_size = mat.shape[axis] + if mat_size != perm.size: + try: + perm = perm.resize(mat_size) + except ValueError: + raise ValueError( + "Size does not match between the permutation {} " + "and the matrix {} threaded over the axis {} " + "and cannot be converted." + .format(perm, mat, axis)) + + return super().__new__(cls, mat, perm, axis) + + def doit(self, deep=True, **hints): + mat, perm, axis = self.args + + if deep: + mat = mat.doit(deep=deep, **hints) + perm = perm.doit(deep=deep, **hints) + + if perm.is_Identity: + return mat + + if mat.is_Identity: + if axis is S.Zero: + return PermutationMatrix(perm) + elif axis is S.One: + return PermutationMatrix(perm**-1) + + if isinstance(mat, (ZeroMatrix, OneMatrix)): + return mat + + if isinstance(mat, MatrixPermute) and mat.args[2] == axis: + return MatrixPermute(mat.args[0], perm * mat.args[1], axis) + + return self + + @property + def shape(self): + return self.args[0].shape + + def _entry(self, i, j, **kwargs): + mat, perm, axis = self.args + + if axis == 0: + return mat[perm.apply(i), j] + elif axis == 1: + return mat[i, perm.apply(j)] + + def _eval_rewrite_as_MatMul(self, *args, **kwargs): + from .matmul import MatMul + + mat, perm, axis = self.args + + deep = kwargs.get("deep", True) + + if deep: + mat = mat.rewrite(MatMul) + + if axis == 0: + return MatMul(PermutationMatrix(perm), mat) + elif axis == 1: + return MatMul(mat, PermutationMatrix(perm**-1)) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/expressions/tests/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/expressions/tests/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..63ffc43ba8c8dc2caa9a31d233a2156c8ad8a430 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/expressions/tests/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/expressions/tests/__pycache__/test_matmul.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/expressions/tests/__pycache__/test_matmul.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c96d6a8d32e51a198a1ce8546a8d649ff0c4f34e Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/expressions/tests/__pycache__/test_matmul.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/graph.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/graph.py new file mode 100644 index 0000000000000000000000000000000000000000..4c6356db884cfcd3c759ada07ac559f43dbcbbcb --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/graph.py @@ -0,0 +1,279 @@ +from sympy.utilities.iterables import \ + flatten, connected_components, strongly_connected_components +from .exceptions import NonSquareMatrixError + + +def _connected_components(M): + """Returns the list of connected vertices of the graph when + a square matrix is viewed as a weighted graph. + + Examples + ======== + + >>> from sympy import Matrix + >>> A = Matrix([ + ... [66, 0, 0, 68, 0, 0, 0, 0, 67], + ... [0, 55, 0, 0, 0, 0, 54, 53, 0], + ... [0, 0, 0, 0, 1, 2, 0, 0, 0], + ... [86, 0, 0, 88, 0, 0, 0, 0, 87], + ... [0, 0, 10, 0, 11, 12, 0, 0, 0], + ... [0, 0, 20, 0, 21, 22, 0, 0, 0], + ... [0, 45, 0, 0, 0, 0, 44, 43, 0], + ... [0, 35, 0, 0, 0, 0, 34, 33, 0], + ... [76, 0, 0, 78, 0, 0, 0, 0, 77]]) + >>> A.connected_components() + [[0, 3, 8], [1, 6, 7], [2, 4, 5]] + + Notes + ===== + + Even if any symbolic elements of the matrix can be indeterminate + to be zero mathematically, this only takes the account of the + structural aspect of the matrix, so they will considered to be + nonzero. + """ + if not M.is_square: + raise NonSquareMatrixError + + V = range(M.rows) + E = sorted(M.todok().keys()) + return connected_components((V, E)) + + +def _strongly_connected_components(M): + """Returns the list of strongly connected vertices of the graph when + a square matrix is viewed as a weighted graph. + + Examples + ======== + + >>> from sympy import Matrix + >>> A = Matrix([ + ... [44, 0, 0, 0, 43, 0, 45, 0, 0], + ... [0, 66, 62, 61, 0, 68, 0, 60, 67], + ... [0, 0, 22, 21, 0, 0, 0, 20, 0], + ... [0, 0, 12, 11, 0, 0, 0, 10, 0], + ... [34, 0, 0, 0, 33, 0, 35, 0, 0], + ... [0, 86, 82, 81, 0, 88, 0, 80, 87], + ... [54, 0, 0, 0, 53, 0, 55, 0, 0], + ... [0, 0, 2, 1, 0, 0, 0, 0, 0], + ... [0, 76, 72, 71, 0, 78, 0, 70, 77]]) + >>> A.strongly_connected_components() + [[0, 4, 6], [2, 3, 7], [1, 5, 8]] + """ + if not M.is_square: + raise NonSquareMatrixError + + # RepMatrix uses the more efficient DomainMatrix.scc() method + rep = getattr(M, '_rep', None) + if rep is not None: + return rep.scc() + + V = range(M.rows) + E = sorted(M.todok().keys()) + return strongly_connected_components((V, E)) + + +def _connected_components_decomposition(M): + """Decomposes a square matrix into block diagonal form only + using the permutations. + + Explanation + =========== + + The decomposition is in a form of $A = P^{-1} B P$ where $P$ is a + permutation matrix and $B$ is a block diagonal matrix. + + Returns + ======= + + P, B : PermutationMatrix, BlockDiagMatrix + *P* is a permutation matrix for the similarity transform + as in the explanation. And *B* is the block diagonal matrix of + the result of the permutation. + + If you would like to get the diagonal blocks from the + BlockDiagMatrix, see + :meth:`~sympy.matrices.expressions.blockmatrix.BlockDiagMatrix.get_diag_blocks`. + + Examples + ======== + + >>> from sympy import Matrix, pprint + >>> A = Matrix([ + ... [66, 0, 0, 68, 0, 0, 0, 0, 67], + ... [0, 55, 0, 0, 0, 0, 54, 53, 0], + ... [0, 0, 0, 0, 1, 2, 0, 0, 0], + ... [86, 0, 0, 88, 0, 0, 0, 0, 87], + ... [0, 0, 10, 0, 11, 12, 0, 0, 0], + ... [0, 0, 20, 0, 21, 22, 0, 0, 0], + ... [0, 45, 0, 0, 0, 0, 44, 43, 0], + ... [0, 35, 0, 0, 0, 0, 34, 33, 0], + ... [76, 0, 0, 78, 0, 0, 0, 0, 77]]) + + >>> P, B = A.connected_components_decomposition() + >>> pprint(P) + PermutationMatrix((1 3)(2 8 5 7 4 6)) + >>> pprint(B) + [[66 68 67] ] + [[ ] ] + [[86 88 87] 0 0 ] + [[ ] ] + [[76 78 77] ] + [ ] + [ [55 54 53] ] + [ [ ] ] + [ 0 [45 44 43] 0 ] + [ [ ] ] + [ [35 34 33] ] + [ ] + [ [0 1 2 ]] + [ [ ]] + [ 0 0 [10 11 12]] + [ [ ]] + [ [20 21 22]] + + >>> P = P.as_explicit() + >>> B = B.as_explicit() + >>> P.T*B*P == A + True + + Notes + ===== + + This problem corresponds to the finding of the connected components + of a graph, when a matrix is viewed as a weighted graph. + """ + from sympy.combinatorics.permutations import Permutation + from sympy.matrices.expressions.blockmatrix import BlockDiagMatrix + from sympy.matrices.expressions.permutation import PermutationMatrix + + iblocks = M.connected_components() + + p = Permutation(flatten(iblocks)) + P = PermutationMatrix(p) + + blocks = [] + for b in iblocks: + blocks.append(M[b, b]) + B = BlockDiagMatrix(*blocks) + return P, B + + +def _strongly_connected_components_decomposition(M, lower=True): + """Decomposes a square matrix into block triangular form only + using the permutations. + + Explanation + =========== + + The decomposition is in a form of $A = P^{-1} B P$ where $P$ is a + permutation matrix and $B$ is a block diagonal matrix. + + Parameters + ========== + + lower : bool + Makes $B$ lower block triangular when ``True``. + Otherwise, makes $B$ upper block triangular. + + Returns + ======= + + P, B : PermutationMatrix, BlockMatrix + *P* is a permutation matrix for the similarity transform + as in the explanation. And *B* is the block triangular matrix of + the result of the permutation. + + Examples + ======== + + >>> from sympy import Matrix, pprint + >>> A = Matrix([ + ... [44, 0, 0, 0, 43, 0, 45, 0, 0], + ... [0, 66, 62, 61, 0, 68, 0, 60, 67], + ... [0, 0, 22, 21, 0, 0, 0, 20, 0], + ... [0, 0, 12, 11, 0, 0, 0, 10, 0], + ... [34, 0, 0, 0, 33, 0, 35, 0, 0], + ... [0, 86, 82, 81, 0, 88, 0, 80, 87], + ... [54, 0, 0, 0, 53, 0, 55, 0, 0], + ... [0, 0, 2, 1, 0, 0, 0, 0, 0], + ... [0, 76, 72, 71, 0, 78, 0, 70, 77]]) + + A lower block triangular decomposition: + + >>> P, B = A.strongly_connected_components_decomposition() + >>> pprint(P) + PermutationMatrix((8)(1 4 3 2 6)(5 7)) + >>> pprint(B) + [[44 43 45] [0 0 0] [0 0 0] ] + [[ ] [ ] [ ] ] + [[34 33 35] [0 0 0] [0 0 0] ] + [[ ] [ ] [ ] ] + [[54 53 55] [0 0 0] [0 0 0] ] + [ ] + [ [0 0 0] [22 21 20] [0 0 0] ] + [ [ ] [ ] [ ] ] + [ [0 0 0] [12 11 10] [0 0 0] ] + [ [ ] [ ] [ ] ] + [ [0 0 0] [2 1 0 ] [0 0 0] ] + [ ] + [ [0 0 0] [62 61 60] [66 68 67]] + [ [ ] [ ] [ ]] + [ [0 0 0] [82 81 80] [86 88 87]] + [ [ ] [ ] [ ]] + [ [0 0 0] [72 71 70] [76 78 77]] + + >>> P = P.as_explicit() + >>> B = B.as_explicit() + >>> P.T * B * P == A + True + + An upper block triangular decomposition: + + >>> P, B = A.strongly_connected_components_decomposition(lower=False) + >>> pprint(P) + PermutationMatrix((0 1 5 7 4 3 2 8 6)) + >>> pprint(B) + [[66 68 67] [62 61 60] [0 0 0] ] + [[ ] [ ] [ ] ] + [[86 88 87] [82 81 80] [0 0 0] ] + [[ ] [ ] [ ] ] + [[76 78 77] [72 71 70] [0 0 0] ] + [ ] + [ [0 0 0] [22 21 20] [0 0 0] ] + [ [ ] [ ] [ ] ] + [ [0 0 0] [12 11 10] [0 0 0] ] + [ [ ] [ ] [ ] ] + [ [0 0 0] [2 1 0 ] [0 0 0] ] + [ ] + [ [0 0 0] [0 0 0] [44 43 45]] + [ [ ] [ ] [ ]] + [ [0 0 0] [0 0 0] [34 33 35]] + [ [ ] [ ] [ ]] + [ [0 0 0] [0 0 0] [54 53 55]] + + >>> P = P.as_explicit() + >>> B = B.as_explicit() + >>> P.T * B * P == A + True + """ + from sympy.combinatorics.permutations import Permutation + from sympy.matrices.expressions.blockmatrix import BlockMatrix + from sympy.matrices.expressions.permutation import PermutationMatrix + + iblocks = M.strongly_connected_components() + if not lower: + iblocks = list(reversed(iblocks)) + + p = Permutation(flatten(iblocks)) + P = PermutationMatrix(p) + + rows = [] + for a in iblocks: + cols = [] + for b in iblocks: + cols.append(M[a, b]) + rows.append(cols) + B = BlockMatrix(rows) + return P, B diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/immutable.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/immutable.py new file mode 100644 index 0000000000000000000000000000000000000000..7ec2174bf1c785e1a4698e1b55078d300e62dafe --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/immutable.py @@ -0,0 +1,196 @@ +from mpmath.matrices.matrices import _matrix + +from sympy.core import Basic, Dict, Tuple +from sympy.core.numbers import Integer +from sympy.core.cache import cacheit +from sympy.core.sympify import _sympy_converter as sympify_converter, _sympify +from sympy.matrices.dense import DenseMatrix +from sympy.matrices.expressions import MatrixExpr +from sympy.matrices.matrixbase import MatrixBase +from sympy.matrices.repmatrix import RepMatrix +from sympy.matrices.sparse import SparseRepMatrix +from sympy.multipledispatch import dispatch + + +def sympify_matrix(arg): + return arg.as_immutable() + + +sympify_converter[MatrixBase] = sympify_matrix + + +def sympify_mpmath_matrix(arg): + mat = [_sympify(x) for x in arg] + return ImmutableDenseMatrix(arg.rows, arg.cols, mat) + + +sympify_converter[_matrix] = sympify_mpmath_matrix + + +class ImmutableRepMatrix(RepMatrix, MatrixExpr): # type: ignore + """Immutable matrix based on RepMatrix + + Uses DomainMAtrix as the internal representation. + """ + + # + # This is a subclass of RepMatrix that adds/overrides some methods to make + # the instances Basic and immutable. ImmutableRepMatrix is a superclass for + # both ImmutableDenseMatrix and ImmutableSparseMatrix. + # + + def __new__(cls, *args, **kwargs): + return cls._new(*args, **kwargs) + + __hash__ = MatrixExpr.__hash__ + + def copy(self): + return self + + @property + def cols(self): + return self._cols + + @property + def rows(self): + return self._rows + + @property + def shape(self): + return self._rows, self._cols + + def as_immutable(self): + return self + + def _entry(self, i, j, **kwargs): + return self[i, j] + + def __setitem__(self, *args): + raise TypeError("Cannot set values of {}".format(self.__class__)) + + def is_diagonalizable(self, reals_only=False, **kwargs): + return super().is_diagonalizable( + reals_only=reals_only, **kwargs) + + is_diagonalizable.__doc__ = SparseRepMatrix.is_diagonalizable.__doc__ + is_diagonalizable = cacheit(is_diagonalizable) + + def analytic_func(self, f, x): + return self.as_mutable().analytic_func(f, x).as_immutable() + + +class ImmutableDenseMatrix(DenseMatrix, ImmutableRepMatrix): # type: ignore + """Create an immutable version of a matrix. + + Examples + ======== + + >>> from sympy import eye, ImmutableMatrix + >>> ImmutableMatrix(eye(3)) + Matrix([ + [1, 0, 0], + [0, 1, 0], + [0, 0, 1]]) + >>> _[0, 0] = 42 + Traceback (most recent call last): + ... + TypeError: Cannot set values of ImmutableDenseMatrix + """ + + # MatrixExpr is set as NotIterable, but we want explicit matrices to be + # iterable + _iterable = True + _class_priority = 8 + _op_priority = 10.001 + + @classmethod + def _new(cls, *args, **kwargs): + if len(args) == 1 and isinstance(args[0], ImmutableDenseMatrix): + return args[0] + if kwargs.get('copy', True) is False: + if len(args) != 3: + raise TypeError("'copy=False' requires a matrix be initialized as rows,cols,[list]") + rows, cols, flat_list = args + else: + rows, cols, flat_list = cls._handle_creation_inputs(*args, **kwargs) + flat_list = list(flat_list) # create a shallow copy + + rep = cls._flat_list_to_DomainMatrix(rows, cols, flat_list) + + return cls._fromrep(rep) + + @classmethod + def _fromrep(cls, rep): + rows, cols = rep.shape + flat_list = rep.to_sympy().to_list_flat() + obj = Basic.__new__(cls, + Integer(rows), + Integer(cols), + Tuple(*flat_list, sympify=False)) + obj._rows = rows + obj._cols = cols + obj._rep = rep + return obj + + +# make sure ImmutableDenseMatrix is aliased as ImmutableMatrix +ImmutableMatrix = ImmutableDenseMatrix + + +class ImmutableSparseMatrix(SparseRepMatrix, ImmutableRepMatrix): # type:ignore + """Create an immutable version of a sparse matrix. + + Examples + ======== + + >>> from sympy import eye, ImmutableSparseMatrix + >>> ImmutableSparseMatrix(1, 1, {}) + Matrix([[0]]) + >>> ImmutableSparseMatrix(eye(3)) + Matrix([ + [1, 0, 0], + [0, 1, 0], + [0, 0, 1]]) + >>> _[0, 0] = 42 + Traceback (most recent call last): + ... + TypeError: Cannot set values of ImmutableSparseMatrix + >>> _.shape + (3, 3) + """ + is_Matrix = True + _class_priority = 9 + + @classmethod + def _new(cls, *args, **kwargs): + rows, cols, smat = cls._handle_creation_inputs(*args, **kwargs) + + rep = cls._smat_to_DomainMatrix(rows, cols, smat) + + return cls._fromrep(rep) + + @classmethod + def _fromrep(cls, rep): + rows, cols = rep.shape + smat = rep.to_sympy().to_dok() + obj = Basic.__new__(cls, Integer(rows), Integer(cols), Dict(smat)) + obj._rows = rows + obj._cols = cols + obj._rep = rep + return obj + + +@dispatch(ImmutableDenseMatrix, ImmutableDenseMatrix) +def _eval_is_eq(lhs, rhs): # noqa:F811 + """Helper method for Equality with matrices.sympy. + + Relational automatically converts matrices to ImmutableDenseMatrix + instances, so this method only applies here. Returns True if the + matrices are definitively the same, False if they are definitively + different, and None if undetermined (e.g. if they contain Symbols). + Returning None triggers default handling of Equalities. + + """ + if lhs.shape != rhs.shape: + return False + return (lhs - rhs).is_zero_matrix diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/repmatrix.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/repmatrix.py new file mode 100644 index 0000000000000000000000000000000000000000..e20781eefc600c7ea8b08f4051b76aef451b88a3 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/repmatrix.py @@ -0,0 +1,1025 @@ +from collections import defaultdict + +from operator import index as index_ + +from sympy.core.expr import Expr +from sympy.core.kind import Kind, NumberKind, UndefinedKind +from sympy.core.numbers import Integer, Rational +from sympy.core.sympify import _sympify, SympifyError +from sympy.core.singleton import S +from sympy.polys.domains import ZZ, QQ, GF, EXRAW +from sympy.polys.matrices import DomainMatrix +from sympy.polys.matrices.exceptions import DMNonInvertibleMatrixError +from sympy.polys.polyerrors import CoercionFailed +from sympy.utilities.exceptions import sympy_deprecation_warning +from sympy.utilities.iterables import is_sequence +from sympy.utilities.misc import filldedent, as_int + +from .exceptions import ShapeError, NonSquareMatrixError, NonInvertibleMatrixError +from .matrixbase import classof, MatrixBase +from .kind import MatrixKind + + +class RepMatrix(MatrixBase): + """Matrix implementation based on DomainMatrix as an internal representation. + + The RepMatrix class is a superclass for Matrix, ImmutableMatrix, + SparseMatrix and ImmutableSparseMatrix which are the main usable matrix + classes in SymPy. Most methods on this class are simply forwarded to + DomainMatrix. + """ + + # + # MatrixBase is the common superclass for all of the usable explicit matrix + # classes in SymPy. The idea is that MatrixBase is an abstract class though + # and that subclasses will implement the lower-level methods. + # + # RepMatrix is a subclass of MatrixBase that uses DomainMatrix as an + # internal representation and delegates lower-level methods to + # DomainMatrix. All of SymPy's standard explicit matrix classes subclass + # RepMatrix and so use DomainMatrix internally. + # + # A RepMatrix uses an internal DomainMatrix with the domain set to ZZ, QQ + # or EXRAW. The EXRAW domain is equivalent to the previous implementation + # of Matrix that used Expr for the elements. The ZZ and QQ domains are used + # when applicable just because they are compatible with the previous + # implementation but are much more efficient. Other domains such as QQ[x] + # are not used because they differ from Expr in some way (e.g. automatic + # expansion of powers and products). + # + + _rep: DomainMatrix + + def __eq__(self, other): + # Skip sympify for mutable matrices... + if not isinstance(other, RepMatrix): + try: + other = _sympify(other) + except SympifyError: + return NotImplemented + if not isinstance(other, RepMatrix): + return NotImplemented + + return self._rep.unify_eq(other._rep) + + def to_DM(self, domain=None, **kwargs): + """Convert to a :class:`~.DomainMatrix`. + + Examples + ======== + + >>> from sympy import Matrix + >>> M = Matrix([[1, 2], [3, 4]]) + >>> M.to_DM() + DomainMatrix({0: {0: 1, 1: 2}, 1: {0: 3, 1: 4}}, (2, 2), ZZ) + + The :meth:`DomainMatrix.to_Matrix` method can be used to convert back: + + >>> M.to_DM().to_Matrix() == M + True + + The domain can be given explicitly or otherwise it will be chosen by + :func:`construct_domain`. Any keyword arguments (besides ``domain``) + are passed to :func:`construct_domain`: + + >>> from sympy import QQ, symbols + >>> x = symbols('x') + >>> M = Matrix([[x, 1], [1, x]]) + >>> M + Matrix([ + [x, 1], + [1, x]]) + >>> M.to_DM().domain + ZZ[x] + >>> M.to_DM(field=True).domain + ZZ(x) + >>> M.to_DM(domain=QQ[x]).domain + QQ[x] + + See Also + ======== + + DomainMatrix + DomainMatrix.to_Matrix + DomainMatrix.convert_to + DomainMatrix.choose_domain + construct_domain + """ + if domain is not None: + if kwargs: + raise TypeError("Options cannot be used with domain parameter") + return self._rep.convert_to(domain) + + rep = self._rep + dom = rep.domain + + # If the internal DomainMatrix is already ZZ or QQ then we can maybe + # bypass calling construct_domain or performing any conversions. Some + # kwargs might affect this though e.g. field=True (not sure if there + # are others). + if not kwargs: + if dom.is_ZZ: + return rep.copy() + elif dom.is_QQ: + # All elements might be integers + try: + return rep.convert_to(ZZ) + except CoercionFailed: + pass + return rep.copy() + + # Let construct_domain choose a domain + rep_dom = rep.choose_domain(**kwargs) + + # XXX: There should be an option to construct_domain to choose EXRAW + # instead of EX. At least converting to EX does not initially trigger + # EX.simplify which is what we want here but should probably be + # considered a bug in EX. Perhaps also this could be handled in + # DomainMatrix.choose_domain rather than here... + if rep_dom.domain.is_EX: + rep_dom = rep_dom.convert_to(EXRAW) + + return rep_dom + + @classmethod + def _unify_element_sympy(cls, rep, element): + domain = rep.domain + element = _sympify(element) + + if domain != EXRAW: + # The domain can only be ZZ, QQ or EXRAW + if element.is_Integer: + new_domain = domain + elif element.is_Rational: + new_domain = QQ + else: + new_domain = EXRAW + + # XXX: This converts the domain for all elements in the matrix + # which can be slow. This happens e.g. if __setitem__ changes one + # element to something that does not fit in the domain + if new_domain != domain: + rep = rep.convert_to(new_domain) + domain = new_domain + + if domain != EXRAW: + element = new_domain.from_sympy(element) + + if domain == EXRAW and not isinstance(element, Expr): + sympy_deprecation_warning( + """ + non-Expr objects in a Matrix is deprecated. Matrix represents + a mathematical matrix. To represent a container of non-numeric + entities, Use a list of lists, TableForm, NumPy array, or some + other data structure instead. + """, + deprecated_since_version="1.9", + active_deprecations_target="deprecated-non-expr-in-matrix", + stacklevel=4, + ) + + return rep, element + + @classmethod + def _dod_to_DomainMatrix(cls, rows, cols, dod, types): + + if not all(issubclass(typ, Expr) for typ in types): + sympy_deprecation_warning( + """ + non-Expr objects in a Matrix is deprecated. Matrix represents + a mathematical matrix. To represent a container of non-numeric + entities, Use a list of lists, TableForm, NumPy array, or some + other data structure instead. + """, + deprecated_since_version="1.9", + active_deprecations_target="deprecated-non-expr-in-matrix", + stacklevel=6, + ) + + rep = DomainMatrix(dod, (rows, cols), EXRAW) + + if all(issubclass(typ, Rational) for typ in types): + if all(issubclass(typ, Integer) for typ in types): + rep = rep.convert_to(ZZ) + else: + rep = rep.convert_to(QQ) + + return rep + + @classmethod + def _flat_list_to_DomainMatrix(cls, rows, cols, flat_list): + + elements_dod = defaultdict(dict) + for n, element in enumerate(flat_list): + if element != 0: + i, j = divmod(n, cols) + elements_dod[i][j] = element + + types = set(map(type, flat_list)) + + rep = cls._dod_to_DomainMatrix(rows, cols, elements_dod, types) + return rep + + @classmethod + def _smat_to_DomainMatrix(cls, rows, cols, smat): + + elements_dod = defaultdict(dict) + for (i, j), element in smat.items(): + if element != 0: + elements_dod[i][j] = element + + types = set(map(type, smat.values())) + + rep = cls._dod_to_DomainMatrix(rows, cols, elements_dod, types) + return rep + + def flat(self): + return self._rep.to_sympy().to_list_flat() + + def _eval_tolist(self): + return self._rep.to_sympy().to_list() + + def _eval_todok(self): + return self._rep.to_sympy().to_dok() + + @classmethod + def _eval_from_dok(cls, rows, cols, dok): + return cls._fromrep(cls._smat_to_DomainMatrix(rows, cols, dok)) + + def _eval_values(self): + return list(self._eval_iter_values()) + + def _eval_iter_values(self): + rep = self._rep + K = rep.domain + values = rep.iter_values() + if not K.is_EXRAW: + values = map(K.to_sympy, values) + return values + + def _eval_iter_items(self): + rep = self._rep + K = rep.domain + to_sympy = K.to_sympy + items = rep.iter_items() + if not K.is_EXRAW: + items = ((i, to_sympy(v)) for i, v in items) + return items + + def copy(self): + return self._fromrep(self._rep.copy()) + + @property + def kind(self) -> MatrixKind: + domain = self._rep.domain + element_kind: Kind + if domain in (ZZ, QQ): + element_kind = NumberKind + elif domain == EXRAW: + kinds = {e.kind for e in self.values()} + if len(kinds) == 1: + [element_kind] = kinds + else: + element_kind = UndefinedKind + else: # pragma: no cover + raise RuntimeError("Domain should only be ZZ, QQ or EXRAW") + return MatrixKind(element_kind) + + def _eval_has(self, *patterns): + # if the matrix has any zeros, see if S.Zero + # has the pattern. If _smat is full length, + # the matrix has no zeros. + zhas = False + dok = self.todok() + if len(dok) != self.rows*self.cols: + zhas = S.Zero.has(*patterns) + return zhas or any(value.has(*patterns) for value in dok.values()) + + def _eval_is_Identity(self): + if not all(self[i, i] == 1 for i in range(self.rows)): + return False + return len(self.todok()) == self.rows + + def _eval_is_symmetric(self, simpfunc): + diff = (self - self.T).applyfunc(simpfunc) + return len(diff.values()) == 0 + + def _eval_transpose(self): + """Returns the transposed SparseMatrix of this SparseMatrix. + + Examples + ======== + + >>> from sympy import SparseMatrix + >>> a = SparseMatrix(((1, 2), (3, 4))) + >>> a + Matrix([ + [1, 2], + [3, 4]]) + >>> a.T + Matrix([ + [1, 3], + [2, 4]]) + """ + return self._fromrep(self._rep.transpose()) + + def _eval_col_join(self, other): + return self._fromrep(self._rep.vstack(other._rep)) + + def _eval_row_join(self, other): + return self._fromrep(self._rep.hstack(other._rep)) + + def _eval_extract(self, rowsList, colsList): + return self._fromrep(self._rep.extract(rowsList, colsList)) + + def __getitem__(self, key): + return _getitem_RepMatrix(self, key) + + @classmethod + def _eval_zeros(cls, rows, cols): + rep = DomainMatrix.zeros((rows, cols), ZZ) + return cls._fromrep(rep) + + @classmethod + def _eval_eye(cls, rows, cols): + rep = DomainMatrix.eye((rows, cols), ZZ) + return cls._fromrep(rep) + + def _eval_add(self, other): + return classof(self, other)._fromrep(self._rep + other._rep) + + def _eval_matrix_mul(self, other): + return classof(self, other)._fromrep(self._rep * other._rep) + + def _eval_matrix_mul_elementwise(self, other): + selfrep, otherrep = self._rep.unify(other._rep) + newrep = selfrep.mul_elementwise(otherrep) + return classof(self, other)._fromrep(newrep) + + def _eval_scalar_mul(self, other): + rep, other = self._unify_element_sympy(self._rep, other) + return self._fromrep(rep.scalarmul(other)) + + def _eval_scalar_rmul(self, other): + rep, other = self._unify_element_sympy(self._rep, other) + return self._fromrep(rep.rscalarmul(other)) + + def _eval_Abs(self): + return self._fromrep(self._rep.applyfunc(abs)) + + def _eval_conjugate(self): + rep = self._rep + domain = rep.domain + if domain in (ZZ, QQ): + return self.copy() + else: + return self._fromrep(rep.applyfunc(lambda e: e.conjugate())) + + def equals(self, other, failing_expression=False): + """Applies ``equals`` to corresponding elements of the matrices, + trying to prove that the elements are equivalent, returning True + if they are, False if any pair is not, and None (or the first + failing expression if failing_expression is True) if it cannot + be decided if the expressions are equivalent or not. This is, in + general, an expensive operation. + + Examples + ======== + + >>> from sympy import Matrix + >>> from sympy.abc import x + >>> A = Matrix([x*(x - 1), 0]) + >>> B = Matrix([x**2 - x, 0]) + >>> A == B + False + >>> A.simplify() == B.simplify() + True + >>> A.equals(B) + True + >>> A.equals(2) + False + + See Also + ======== + sympy.core.expr.Expr.equals + """ + if self.shape != getattr(other, 'shape', None): + return False + + rv = True + for i in range(self.rows): + for j in range(self.cols): + ans = self[i, j].equals(other[i, j], failing_expression) + if ans is False: + return False + elif ans is not True and rv is True: + rv = ans + return rv + + def inv_mod(M, m): + r""" + Returns the inverse of the integer matrix ``M`` modulo ``m``. + + Examples + ======== + + >>> from sympy import Matrix + >>> A = Matrix(2, 2, [1, 2, 3, 4]) + >>> A.inv_mod(5) + Matrix([ + [3, 1], + [4, 2]]) + >>> A.inv_mod(3) + Matrix([ + [1, 1], + [0, 1]]) + + """ + + if not M.is_square: + raise NonSquareMatrixError() + + try: + m = as_int(m) + except ValueError: + raise TypeError("inv_mod: modulus m must be an integer") + + K = GF(m, symmetric=False) + + try: + dM = M.to_DM(K) + except CoercionFailed: + raise ValueError("inv_mod: matrix entries must be integers") + + try: + dMi = dM.inv() + except DMNonInvertibleMatrixError as exc: + msg = f'Matrix is not invertible (mod {m})' + raise NonInvertibleMatrixError(msg) from exc + + return dMi.to_Matrix() + + def lll(self, delta=0.75): + """LLL-reduced basis for the rowspace of a matrix of integers. + + Performs the Lenstra–Lenstra–Lovász (LLL) basis reduction algorithm. + + The implementation is provided by :class:`~DomainMatrix`. See + :meth:`~DomainMatrix.lll` for more details. + + Examples + ======== + + >>> from sympy import Matrix + >>> M = Matrix([[1, 0, 0, 0, -20160], + ... [0, 1, 0, 0, 33768], + ... [0, 0, 1, 0, 39578], + ... [0, 0, 0, 1, 47757]]) + >>> M.lll() + Matrix([ + [ 10, -3, -2, 8, -4], + [ 3, -9, 8, 1, -11], + [ -3, 13, -9, -3, -9], + [-12, -7, -11, 9, -1]]) + + See Also + ======== + + lll_transform + sympy.polys.matrices.domainmatrix.DomainMatrix.lll + """ + delta = QQ.from_sympy(_sympify(delta)) + dM = self._rep.convert_to(ZZ) + basis = dM.lll(delta=delta) + return self._fromrep(basis) + + def lll_transform(self, delta=0.75): + """LLL-reduced basis and transformation matrix. + + Performs the Lenstra–Lenstra–Lovász (LLL) basis reduction algorithm. + + The implementation is provided by :class:`~DomainMatrix`. See + :meth:`~DomainMatrix.lll_transform` for more details. + + Examples + ======== + + >>> from sympy import Matrix + >>> M = Matrix([[1, 0, 0, 0, -20160], + ... [0, 1, 0, 0, 33768], + ... [0, 0, 1, 0, 39578], + ... [0, 0, 0, 1, 47757]]) + >>> B, T = M.lll_transform() + >>> B + Matrix([ + [ 10, -3, -2, 8, -4], + [ 3, -9, 8, 1, -11], + [ -3, 13, -9, -3, -9], + [-12, -7, -11, 9, -1]]) + >>> T + Matrix([ + [ 10, -3, -2, 8], + [ 3, -9, 8, 1], + [ -3, 13, -9, -3], + [-12, -7, -11, 9]]) + + The transformation matrix maps the original basis to the LLL-reduced + basis: + + >>> T * M == B + True + + See Also + ======== + + lll + sympy.polys.matrices.domainmatrix.DomainMatrix.lll_transform + """ + delta = QQ.from_sympy(_sympify(delta)) + dM = self._rep.convert_to(ZZ) + basis, transform = dM.lll_transform(delta=delta) + B = self._fromrep(basis) + T = self._fromrep(transform) + return B, T + + +class MutableRepMatrix(RepMatrix): + """Mutable matrix based on DomainMatrix as the internal representation""" + + # + # MutableRepMatrix is a subclass of RepMatrix that adds/overrides methods + # to make the instances mutable. MutableRepMatrix is a superclass for both + # MutableDenseMatrix and MutableSparseMatrix. + # + + is_zero = False + + def __new__(cls, *args, **kwargs): + return cls._new(*args, **kwargs) + + @classmethod + def _new(cls, *args, copy=True, **kwargs): + if copy is False: + # The input was rows, cols, [list]. + # It should be used directly without creating a copy. + if len(args) != 3: + raise TypeError("'copy=False' requires a matrix be initialized as rows,cols,[list]") + rows, cols, flat_list = args + else: + rows, cols, flat_list = cls._handle_creation_inputs(*args, **kwargs) + flat_list = list(flat_list) # create a shallow copy + + rep = cls._flat_list_to_DomainMatrix(rows, cols, flat_list) + + return cls._fromrep(rep) + + @classmethod + def _fromrep(cls, rep): + obj = super().__new__(cls) + obj.rows, obj.cols = rep.shape + obj._rep = rep + return obj + + def copy(self): + return self._fromrep(self._rep.copy()) + + def as_mutable(self): + return self.copy() + + def __setitem__(self, key, value): + """ + + Examples + ======== + + >>> from sympy import Matrix, I, zeros, ones + >>> m = Matrix(((1, 2+I), (3, 4))) + >>> m + Matrix([ + [1, 2 + I], + [3, 4]]) + >>> m[1, 0] = 9 + >>> m + Matrix([ + [1, 2 + I], + [9, 4]]) + >>> m[1, 0] = [[0, 1]] + + To replace row r you assign to position r*m where m + is the number of columns: + + >>> M = zeros(4) + >>> m = M.cols + >>> M[3*m] = ones(1, m)*2; M + Matrix([ + [0, 0, 0, 0], + [0, 0, 0, 0], + [0, 0, 0, 0], + [2, 2, 2, 2]]) + + And to replace column c you can assign to position c: + + >>> M[2] = ones(m, 1)*4; M + Matrix([ + [0, 0, 4, 0], + [0, 0, 4, 0], + [0, 0, 4, 0], + [2, 2, 4, 2]]) + """ + rv = self._setitem(key, value) + if rv is not None: + i, j, value = rv + self._rep, value = self._unify_element_sympy(self._rep, value) + self._rep.rep.setitem(i, j, value) + + def _eval_col_del(self, col): + self._rep = DomainMatrix.hstack(self._rep[:,:col], self._rep[:,col+1:]) + self.cols -= 1 + + def _eval_row_del(self, row): + self._rep = DomainMatrix.vstack(self._rep[:row,:], self._rep[row+1:, :]) + self.rows -= 1 + + def _eval_col_insert(self, col, other): + other = self._new(other) + return self.hstack(self[:,:col], other, self[:,col:]) + + def _eval_row_insert(self, row, other): + other = self._new(other) + return self.vstack(self[:row,:], other, self[row:,:]) + + def col_op(self, j, f): + """In-place operation on col j using two-arg functor whose args are + interpreted as (self[i, j], i). + + Examples + ======== + + >>> from sympy import eye + >>> M = eye(3) + >>> M.col_op(1, lambda v, i: v + 2*M[i, 0]); M + Matrix([ + [1, 2, 0], + [0, 1, 0], + [0, 0, 1]]) + + See Also + ======== + col + row_op + """ + for i in range(self.rows): + self[i, j] = f(self[i, j], i) + + def col_swap(self, i, j): + """Swap the two given columns of the matrix in-place. + + Examples + ======== + + >>> from sympy import Matrix + >>> M = Matrix([[1, 0], [1, 0]]) + >>> M + Matrix([ + [1, 0], + [1, 0]]) + >>> M.col_swap(0, 1) + >>> M + Matrix([ + [0, 1], + [0, 1]]) + + See Also + ======== + + col + row_swap + """ + for k in range(0, self.rows): + self[k, i], self[k, j] = self[k, j], self[k, i] + + def row_op(self, i, f): + """In-place operation on row ``i`` using two-arg functor whose args are + interpreted as ``(self[i, j], j)``. + + Examples + ======== + + >>> from sympy import eye + >>> M = eye(3) + >>> M.row_op(1, lambda v, j: v + 2*M[0, j]); M + Matrix([ + [1, 0, 0], + [2, 1, 0], + [0, 0, 1]]) + + See Also + ======== + row + zip_row_op + col_op + + """ + for j in range(self.cols): + self[i, j] = f(self[i, j], j) + + #The next three methods give direct support for the most common row operations inplace. + def row_mult(self,i,factor): + """Multiply the given row by the given factor in-place. + + Examples + ======== + + >>> from sympy import eye + >>> M = eye(3) + >>> M.row_mult(1,7); M + Matrix([ + [1, 0, 0], + [0, 7, 0], + [0, 0, 1]]) + + """ + for j in range(self.cols): + self[i,j] *= factor + + def row_add(self,s,t,k): + """Add k times row s (source) to row t (target) in place. + + Examples + ======== + + >>> from sympy import eye + >>> M = eye(3) + >>> M.row_add(0, 2,3); M + Matrix([ + [1, 0, 0], + [0, 1, 0], + [3, 0, 1]]) + """ + + for j in range(self.cols): + self[t,j] += k*self[s,j] + + def row_swap(self, i, j): + """Swap the two given rows of the matrix in-place. + + Examples + ======== + + >>> from sympy import Matrix + >>> M = Matrix([[0, 1], [1, 0]]) + >>> M + Matrix([ + [0, 1], + [1, 0]]) + >>> M.row_swap(0, 1) + >>> M + Matrix([ + [1, 0], + [0, 1]]) + + See Also + ======== + + row + col_swap + """ + for k in range(0, self.cols): + self[i, k], self[j, k] = self[j, k], self[i, k] + + def zip_row_op(self, i, k, f): + """In-place operation on row ``i`` using two-arg functor whose args are + interpreted as ``(self[i, j], self[k, j])``. + + Examples + ======== + + >>> from sympy import eye + >>> M = eye(3) + >>> M.zip_row_op(1, 0, lambda v, u: v + 2*u); M + Matrix([ + [1, 0, 0], + [2, 1, 0], + [0, 0, 1]]) + + See Also + ======== + row + row_op + col_op + + """ + for j in range(self.cols): + self[i, j] = f(self[i, j], self[k, j]) + + def copyin_list(self, key, value): + """Copy in elements from a list. + + Parameters + ========== + + key : slice + The section of this matrix to replace. + value : iterable + The iterable to copy values from. + + Examples + ======== + + >>> from sympy import eye + >>> I = eye(3) + >>> I[:2, 0] = [1, 2] # col + >>> I + Matrix([ + [1, 0, 0], + [2, 1, 0], + [0, 0, 1]]) + >>> I[1, :2] = [[3, 4]] + >>> I + Matrix([ + [1, 0, 0], + [3, 4, 0], + [0, 0, 1]]) + + See Also + ======== + + copyin_matrix + """ + if not is_sequence(value): + raise TypeError("`value` must be an ordered iterable, not %s." % type(value)) + return self.copyin_matrix(key, type(self)(value)) + + def copyin_matrix(self, key, value): + """Copy in values from a matrix into the given bounds. + + Parameters + ========== + + key : slice + The section of this matrix to replace. + value : Matrix + The matrix to copy values from. + + Examples + ======== + + >>> from sympy import Matrix, eye + >>> M = Matrix([[0, 1], [2, 3], [4, 5]]) + >>> I = eye(3) + >>> I[:3, :2] = M + >>> I + Matrix([ + [0, 1, 0], + [2, 3, 0], + [4, 5, 1]]) + >>> I[0, 1] = M + >>> I + Matrix([ + [0, 0, 1], + [2, 2, 3], + [4, 4, 5]]) + + See Also + ======== + + copyin_list + """ + rlo, rhi, clo, chi = self.key2bounds(key) + shape = value.shape + dr, dc = rhi - rlo, chi - clo + if shape != (dr, dc): + raise ShapeError(filldedent("The Matrix `value` doesn't have the " + "same dimensions " + "as the in sub-Matrix given by `key`.")) + + for i in range(value.rows): + for j in range(value.cols): + self[i + rlo, j + clo] = value[i, j] + + def fill(self, value): + """Fill self with the given value. + + Notes + ===== + + Unless many values are going to be deleted (i.e. set to zero) + this will create a matrix that is slower than a dense matrix in + operations. + + Examples + ======== + + >>> from sympy import SparseMatrix + >>> M = SparseMatrix.zeros(3); M + Matrix([ + [0, 0, 0], + [0, 0, 0], + [0, 0, 0]]) + >>> M.fill(1); M + Matrix([ + [1, 1, 1], + [1, 1, 1], + [1, 1, 1]]) + + See Also + ======== + + zeros + ones + """ + value = _sympify(value) + if not value: + self._rep = DomainMatrix.zeros(self.shape, EXRAW) + else: + elements_dod = {i: dict.fromkeys(range(self.cols), value) for i in range(self.rows)} + self._rep = DomainMatrix(elements_dod, self.shape, EXRAW) + + +def _getitem_RepMatrix(self, key): + """Return portion of self defined by key. If the key involves a slice + then a list will be returned (if key is a single slice) or a matrix + (if key was a tuple involving a slice). + + Examples + ======== + + >>> from sympy import Matrix, I + >>> m = Matrix([ + ... [1, 2 + I], + ... [3, 4 ]]) + + If the key is a tuple that does not involve a slice then that element + is returned: + + >>> m[1, 0] + 3 + + When a tuple key involves a slice, a matrix is returned. Here, the + first column is selected (all rows, column 0): + + >>> m[:, 0] + Matrix([ + [1], + [3]]) + + If the slice is not a tuple then it selects from the underlying + list of elements that are arranged in row order and a list is + returned if a slice is involved: + + >>> m[0] + 1 + >>> m[::2] + [1, 3] + """ + if isinstance(key, tuple): + i, j = key + try: + return self._rep.getitem_sympy(index_(i), index_(j)) + except (TypeError, IndexError): + if (isinstance(i, Expr) and not i.is_number) or (isinstance(j, Expr) and not j.is_number): + if ((j < 0) is True) or ((j >= self.shape[1]) is True) or\ + ((i < 0) is True) or ((i >= self.shape[0]) is True): + raise ValueError("index out of boundary") + from sympy.matrices.expressions.matexpr import MatrixElement + return MatrixElement(self, i, j) + + if isinstance(i, slice): + i = range(self.rows)[i] + elif is_sequence(i): + pass + else: + i = [i] + if isinstance(j, slice): + j = range(self.cols)[j] + elif is_sequence(j): + pass + else: + j = [j] + return self.extract(i, j) + + else: + # Index/slice like a flattened list + rows, cols = self.shape + + # Raise the appropriate exception: + if not rows * cols: + return [][key] + + rep = self._rep.rep + domain = rep.domain + is_slice = isinstance(key, slice) + + if is_slice: + values = [rep.getitem(*divmod(n, cols)) for n in range(rows * cols)[key]] + else: + values = [rep.getitem(*divmod(index_(key), cols))] + + if domain != EXRAW: + to_sympy = domain.to_sympy + values = [to_sympy(val) for val in values] + + if is_slice: + return values + else: + return values[0] diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/solvers.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/solvers.py new file mode 100644 index 0000000000000000000000000000000000000000..01a85b29bec04981854233fb6ad393685bf793e1 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/solvers.py @@ -0,0 +1,942 @@ +from sympy.core.function import expand_mul +from sympy.core.symbol import Dummy, uniquely_named_symbol, symbols +from sympy.utilities.iterables import numbered_symbols + +from .exceptions import ShapeError, NonSquareMatrixError, NonInvertibleMatrixError +from .eigen import _fuzzy_positive_definite +from .utilities import _get_intermediate_simp, _iszero + + +def _diagonal_solve(M, rhs): + """Solves ``Ax = B`` efficiently, where A is a diagonal Matrix, + with non-zero diagonal entries. + + Examples + ======== + + >>> from sympy import Matrix, eye + >>> A = eye(2)*2 + >>> B = Matrix([[1, 2], [3, 4]]) + >>> A.diagonal_solve(B) == B/2 + True + + See Also + ======== + + sympy.matrices.dense.DenseMatrix.lower_triangular_solve + sympy.matrices.dense.DenseMatrix.upper_triangular_solve + gauss_jordan_solve + cholesky_solve + LDLsolve + LUsolve + QRsolve + pinv_solve + cramer_solve + """ + + if not M.is_diagonal(): + raise TypeError("Matrix should be diagonal") + if rhs.rows != M.rows: + raise TypeError("Size mismatch") + + return M._new( + rhs.rows, rhs.cols, lambda i, j: rhs[i, j] / M[i, i]) + + +def _lower_triangular_solve(M, rhs): + """Solves ``Ax = B``, where A is a lower triangular matrix. + + See Also + ======== + + upper_triangular_solve + gauss_jordan_solve + cholesky_solve + diagonal_solve + LDLsolve + LUsolve + QRsolve + pinv_solve + cramer_solve + """ + + from .dense import MutableDenseMatrix + + if not M.is_square: + raise NonSquareMatrixError("Matrix must be square.") + if rhs.rows != M.rows: + raise ShapeError("Matrices size mismatch.") + if not M.is_lower: + raise ValueError("Matrix must be lower triangular.") + + dps = _get_intermediate_simp() + X = MutableDenseMatrix.zeros(M.rows, rhs.cols) + + for j in range(rhs.cols): + for i in range(M.rows): + if M[i, i] == 0: + raise TypeError("Matrix must be non-singular.") + + X[i, j] = dps((rhs[i, j] - sum(M[i, k]*X[k, j] + for k in range(i))) / M[i, i]) + + return M._new(X) + +def _lower_triangular_solve_sparse(M, rhs): + """Solves ``Ax = B``, where A is a lower triangular matrix. + + See Also + ======== + + upper_triangular_solve + gauss_jordan_solve + cholesky_solve + diagonal_solve + LDLsolve + LUsolve + QRsolve + pinv_solve + cramer_solve + """ + + if not M.is_square: + raise NonSquareMatrixError("Matrix must be square.") + if rhs.rows != M.rows: + raise ShapeError("Matrices size mismatch.") + if not M.is_lower: + raise ValueError("Matrix must be lower triangular.") + + dps = _get_intermediate_simp() + rows = [[] for i in range(M.rows)] + + for i, j, v in M.row_list(): + if i > j: + rows[i].append((j, v)) + + X = rhs.as_mutable() + + for j in range(rhs.cols): + for i in range(rhs.rows): + for u, v in rows[i]: + X[i, j] -= v*X[u, j] + + X[i, j] = dps(X[i, j] / M[i, i]) + + return M._new(X) + + +def _upper_triangular_solve(M, rhs): + """Solves ``Ax = B``, where A is an upper triangular matrix. + + See Also + ======== + + lower_triangular_solve + gauss_jordan_solve + cholesky_solve + diagonal_solve + LDLsolve + LUsolve + QRsolve + pinv_solve + cramer_solve + """ + + from .dense import MutableDenseMatrix + + if not M.is_square: + raise NonSquareMatrixError("Matrix must be square.") + if rhs.rows != M.rows: + raise ShapeError("Matrix size mismatch.") + if not M.is_upper: + raise TypeError("Matrix is not upper triangular.") + + dps = _get_intermediate_simp() + X = MutableDenseMatrix.zeros(M.rows, rhs.cols) + + for j in range(rhs.cols): + for i in reversed(range(M.rows)): + if M[i, i] == 0: + raise ValueError("Matrix must be non-singular.") + + X[i, j] = dps((rhs[i, j] - sum(M[i, k]*X[k, j] + for k in range(i + 1, M.rows))) / M[i, i]) + + return M._new(X) + +def _upper_triangular_solve_sparse(M, rhs): + """Solves ``Ax = B``, where A is an upper triangular matrix. + + See Also + ======== + + lower_triangular_solve + gauss_jordan_solve + cholesky_solve + diagonal_solve + LDLsolve + LUsolve + QRsolve + pinv_solve + cramer_solve + """ + + if not M.is_square: + raise NonSquareMatrixError("Matrix must be square.") + if rhs.rows != M.rows: + raise ShapeError("Matrix size mismatch.") + if not M.is_upper: + raise TypeError("Matrix is not upper triangular.") + + dps = _get_intermediate_simp() + rows = [[] for i in range(M.rows)] + + for i, j, v in M.row_list(): + if i < j: + rows[i].append((j, v)) + + X = rhs.as_mutable() + + for j in range(rhs.cols): + for i in reversed(range(rhs.rows)): + for u, v in reversed(rows[i]): + X[i, j] -= v*X[u, j] + + X[i, j] = dps(X[i, j] / M[i, i]) + + return M._new(X) + + +def _cholesky_solve(M, rhs): + """Solves ``Ax = B`` using Cholesky decomposition, + for a general square non-singular matrix. + For a non-square matrix with rows > cols, + the least squares solution is returned. + + See Also + ======== + + sympy.matrices.dense.DenseMatrix.lower_triangular_solve + sympy.matrices.dense.DenseMatrix.upper_triangular_solve + gauss_jordan_solve + diagonal_solve + LDLsolve + LUsolve + QRsolve + pinv_solve + cramer_solve + """ + + if M.rows < M.cols: + raise NotImplementedError( + 'Under-determined System. Try M.gauss_jordan_solve(rhs)') + + hermitian = True + reform = False + + if M.is_symmetric(): + hermitian = False + elif not M.is_hermitian: + reform = True + + if reform or _fuzzy_positive_definite(M) is False: + H = M.H + M = H.multiply(M) + rhs = H.multiply(rhs) + hermitian = not M.is_symmetric() + + L = M.cholesky(hermitian=hermitian) + Y = L.lower_triangular_solve(rhs) + + if hermitian: + return (L.H).upper_triangular_solve(Y) + else: + return (L.T).upper_triangular_solve(Y) + + +def _LDLsolve(M, rhs): + """Solves ``Ax = B`` using LDL decomposition, + for a general square and non-singular matrix. + + For a non-square matrix with rows > cols, + the least squares solution is returned. + + Examples + ======== + + >>> from sympy import Matrix, eye + >>> A = eye(2)*2 + >>> B = Matrix([[1, 2], [3, 4]]) + >>> A.LDLsolve(B) == B/2 + True + + See Also + ======== + + sympy.matrices.dense.DenseMatrix.LDLdecomposition + sympy.matrices.dense.DenseMatrix.lower_triangular_solve + sympy.matrices.dense.DenseMatrix.upper_triangular_solve + gauss_jordan_solve + cholesky_solve + diagonal_solve + LUsolve + QRsolve + pinv_solve + cramer_solve + """ + + if M.rows < M.cols: + raise NotImplementedError( + 'Under-determined System. Try M.gauss_jordan_solve(rhs)') + + hermitian = True + reform = False + + if M.is_symmetric(): + hermitian = False + elif not M.is_hermitian: + reform = True + + if reform or _fuzzy_positive_definite(M) is False: + H = M.H + M = H.multiply(M) + rhs = H.multiply(rhs) + hermitian = not M.is_symmetric() + + L, D = M.LDLdecomposition(hermitian=hermitian) + Y = L.lower_triangular_solve(rhs) + Z = D.diagonal_solve(Y) + + if hermitian: + return (L.H).upper_triangular_solve(Z) + else: + return (L.T).upper_triangular_solve(Z) + + +def _LUsolve(M, rhs, iszerofunc=_iszero): + """Solve the linear system ``Ax = rhs`` for ``x`` where ``A = M``. + + This is for symbolic matrices, for real or complex ones use + mpmath.lu_solve or mpmath.qr_solve. + + See Also + ======== + + sympy.matrices.dense.DenseMatrix.lower_triangular_solve + sympy.matrices.dense.DenseMatrix.upper_triangular_solve + gauss_jordan_solve + cholesky_solve + diagonal_solve + LDLsolve + QRsolve + pinv_solve + LUdecomposition + cramer_solve + """ + + if rhs.rows != M.rows: + raise ShapeError( + "``M`` and ``rhs`` must have the same number of rows.") + + m = M.rows + n = M.cols + + if m < n: + raise NotImplementedError("Underdetermined systems not supported.") + + try: + A, perm = M.LUdecomposition_Simple( + iszerofunc=iszerofunc, rankcheck=True) + except ValueError: + raise NonInvertibleMatrixError("Matrix det == 0; not invertible.") + + dps = _get_intermediate_simp() + b = rhs.permute_rows(perm).as_mutable() + + # forward substitution, all diag entries are scaled to 1 + for i in range(m): + for j in range(min(i, n)): + scale = A[i, j] + b.zip_row_op(i, j, lambda x, y: dps(x - y * scale)) + + # consistency check for overdetermined systems + if m > n: + for i in range(n, m): + for j in range(b.cols): + if not iszerofunc(b[i, j]): + raise ValueError("The system is inconsistent.") + + b = b[0:n, :] # truncate zero rows if consistent + + # backward substitution + for i in range(n - 1, -1, -1): + for j in range(i + 1, n): + scale = A[i, j] + b.zip_row_op(i, j, lambda x, y: dps(x - y * scale)) + + scale = A[i, i] + b.row_op(i, lambda x, _: dps(x / scale)) + + return rhs.__class__(b) + + +def _QRsolve(M, b): + """Solve the linear system ``Ax = b``. + + ``M`` is the matrix ``A``, the method argument is the vector + ``b``. The method returns the solution vector ``x``. If ``b`` is a + matrix, the system is solved for each column of ``b`` and the + return value is a matrix of the same shape as ``b``. + + This method is slower (approximately by a factor of 2) but + more stable for floating-point arithmetic than the LUsolve method. + However, LUsolve usually uses an exact arithmetic, so you do not need + to use QRsolve. + + This is mainly for educational purposes and symbolic matrices, for real + (or complex) matrices use mpmath.qr_solve. + + See Also + ======== + + sympy.matrices.dense.DenseMatrix.lower_triangular_solve + sympy.matrices.dense.DenseMatrix.upper_triangular_solve + gauss_jordan_solve + cholesky_solve + diagonal_solve + LDLsolve + LUsolve + pinv_solve + QRdecomposition + cramer_solve + """ + + dps = _get_intermediate_simp(expand_mul, expand_mul) + Q, R = M.QRdecomposition() + y = Q.T * b + + # back substitution to solve R*x = y: + # We build up the result "backwards" in the vector 'x' and reverse it + # only in the end. + x = [] + n = R.rows + + for j in range(n - 1, -1, -1): + tmp = y[j, :] + + for k in range(j + 1, n): + tmp -= R[j, k] * x[n - 1 - k] + + tmp = dps(tmp) + + x.append(tmp / R[j, j]) + + return M.vstack(*x[::-1]) + + +def _gauss_jordan_solve(M, B, freevar=False): + """ + Solves ``Ax = B`` using Gauss Jordan elimination. + + There may be zero, one, or infinite solutions. If one solution + exists, it will be returned. If infinite solutions exist, it will + be returned parametrically. If no solutions exist, It will throw + ValueError. + + Parameters + ========== + + B : Matrix + The right hand side of the equation to be solved for. Must have + the same number of rows as matrix A. + + freevar : boolean, optional + Flag, when set to `True` will return the indices of the free + variables in the solutions (column Matrix), for a system that is + undetermined (e.g. A has more columns than rows), for which + infinite solutions are possible, in terms of arbitrary + values of free variables. Default `False`. + + Returns + ======= + + x : Matrix + The matrix that will satisfy ``Ax = B``. Will have as many rows as + matrix A has columns, and as many columns as matrix B. + + params : Matrix + If the system is underdetermined (e.g. A has more columns than + rows), infinite solutions are possible, in terms of arbitrary + parameters. These arbitrary parameters are returned as params + Matrix. + + free_var_index : List, optional + If the system is underdetermined (e.g. A has more columns than + rows), infinite solutions are possible, in terms of arbitrary + values of free variables. Then the indices of the free variables + in the solutions (column Matrix) are returned by free_var_index, + if the flag `freevar` is set to `True`. + + Examples + ======== + + >>> from sympy import Matrix + >>> A = Matrix([[1, 2, 1, 1], [1, 2, 2, -1], [2, 4, 0, 6]]) + >>> B = Matrix([7, 12, 4]) + >>> sol, params = A.gauss_jordan_solve(B) + >>> sol + Matrix([ + [-2*tau0 - 3*tau1 + 2], + [ tau0], + [ 2*tau1 + 5], + [ tau1]]) + >>> params + Matrix([ + [tau0], + [tau1]]) + >>> taus_zeroes = { tau:0 for tau in params } + >>> sol_unique = sol.xreplace(taus_zeroes) + >>> sol_unique + Matrix([ + [2], + [0], + [5], + [0]]) + + + >>> A = Matrix([[1, 2, 3], [4, 5, 6], [7, 8, 10]]) + >>> B = Matrix([3, 6, 9]) + >>> sol, params = A.gauss_jordan_solve(B) + >>> sol + Matrix([ + [-1], + [ 2], + [ 0]]) + >>> params + Matrix(0, 1, []) + + >>> A = Matrix([[2, -7], [-1, 4]]) + >>> B = Matrix([[-21, 3], [12, -2]]) + >>> sol, params = A.gauss_jordan_solve(B) + >>> sol + Matrix([ + [0, -2], + [3, -1]]) + >>> params + Matrix(0, 2, []) + + + >>> from sympy import Matrix + >>> A = Matrix([[1, 2, 1, 1], [1, 2, 2, -1], [2, 4, 0, 6]]) + >>> B = Matrix([7, 12, 4]) + >>> sol, params, freevars = A.gauss_jordan_solve(B, freevar=True) + >>> sol + Matrix([ + [-2*tau0 - 3*tau1 + 2], + [ tau0], + [ 2*tau1 + 5], + [ tau1]]) + >>> params + Matrix([ + [tau0], + [tau1]]) + >>> freevars + [1, 3] + + + See Also + ======== + + sympy.matrices.dense.DenseMatrix.lower_triangular_solve + sympy.matrices.dense.DenseMatrix.upper_triangular_solve + cholesky_solve + diagonal_solve + LDLsolve + LUsolve + QRsolve + pinv + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Gaussian_elimination + + """ + + from sympy.matrices import Matrix, zeros + + cls = M.__class__ + aug = M.hstack(M.copy(), B.copy()) + B_cols = B.cols + row, col = aug[:, :-B_cols].shape + + # solve by reduced row echelon form + A, pivots = aug.rref(simplify=True) + A, v = A[:, :-B_cols], A[:, -B_cols:] + pivots = list(filter(lambda p: p < col, pivots)) + rank = len(pivots) + + # Get index of free symbols (free parameters) + # non-pivots columns are free variables + free_var_index = [c for c in range(A.cols) if c not in pivots] + + # Bring to block form + permutation = Matrix(pivots + free_var_index).T + + # check for existence of solutions + # rank of aug Matrix should be equal to rank of coefficient matrix + if not v[rank:, :].is_zero_matrix: + raise ValueError("Linear system has no solution") + + # Free parameters + # what are current unnumbered free symbol names? + name = uniquely_named_symbol('tau', [aug], + compare=lambda i: str(i).rstrip('1234567890'), + modify=lambda s: '_' + s).name + gen = numbered_symbols(name) + tau = Matrix([next(gen) for k in range((col - rank)*B_cols)]).reshape( + col - rank, B_cols) + + # Full parametric solution + V = A[:rank, free_var_index] + vt = v[:rank, :] + free_sol = tau.vstack(vt - V * tau, tau) + + # Undo permutation + sol = zeros(col, B_cols) + + for k in range(col): + sol[permutation[k], :] = free_sol[k,:] + + sol, tau = cls(sol), cls(tau) + + if freevar: + return sol, tau, free_var_index + else: + return sol, tau + + +def _pinv_solve(M, B, arbitrary_matrix=None): + """Solve ``Ax = B`` using the Moore-Penrose pseudoinverse. + + There may be zero, one, or infinite solutions. If one solution + exists, it will be returned. If infinite solutions exist, one will + be returned based on the value of arbitrary_matrix. If no solutions + exist, the least-squares solution is returned. + + Parameters + ========== + + B : Matrix + The right hand side of the equation to be solved for. Must have + the same number of rows as matrix A. + arbitrary_matrix : Matrix + If the system is underdetermined (e.g. A has more columns than + rows), infinite solutions are possible, in terms of an arbitrary + matrix. This parameter may be set to a specific matrix to use + for that purpose; if so, it must be the same shape as x, with as + many rows as matrix A has columns, and as many columns as matrix + B. If left as None, an appropriate matrix containing dummy + symbols in the form of ``wn_m`` will be used, with n and m being + row and column position of each symbol. + + Returns + ======= + + x : Matrix + The matrix that will satisfy ``Ax = B``. Will have as many rows as + matrix A has columns, and as many columns as matrix B. + + Examples + ======== + + >>> from sympy import Matrix + >>> A = Matrix([[1, 2, 3], [4, 5, 6]]) + >>> B = Matrix([7, 8]) + >>> A.pinv_solve(B) + Matrix([ + [ _w0_0/6 - _w1_0/3 + _w2_0/6 - 55/18], + [-_w0_0/3 + 2*_w1_0/3 - _w2_0/3 + 1/9], + [ _w0_0/6 - _w1_0/3 + _w2_0/6 + 59/18]]) + >>> A.pinv_solve(B, arbitrary_matrix=Matrix([0, 0, 0])) + Matrix([ + [-55/18], + [ 1/9], + [ 59/18]]) + + See Also + ======== + + sympy.matrices.dense.DenseMatrix.lower_triangular_solve + sympy.matrices.dense.DenseMatrix.upper_triangular_solve + gauss_jordan_solve + cholesky_solve + diagonal_solve + LDLsolve + LUsolve + QRsolve + pinv + + Notes + ===== + + This may return either exact solutions or least squares solutions. + To determine which, check ``A * A.pinv() * B == B``. It will be + True if exact solutions exist, and False if only a least-squares + solution exists. Be aware that the left hand side of that equation + may need to be simplified to correctly compare to the right hand + side. + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Moore-Penrose_pseudoinverse#Obtaining_all_solutions_of_a_linear_system + + """ + + from sympy.matrices import eye + + A = M + A_pinv = M.pinv() + + if arbitrary_matrix is None: + rows, cols = A.cols, B.cols + w = symbols('w:{}_:{}'.format(rows, cols), cls=Dummy) + arbitrary_matrix = M.__class__(cols, rows, w).T + + return A_pinv.multiply(B) + (eye(A.cols) - + A_pinv.multiply(A)).multiply(arbitrary_matrix) + + +def _cramer_solve(M, rhs, det_method="laplace"): + """Solves system of linear equations using Cramer's rule. + + This method is relatively inefficient compared to other methods. + However it only uses a single division, assuming a division-free determinant + method is provided. This is helpful to minimize the chance of divide-by-zero + cases in symbolic solutions to linear systems. + + Parameters + ========== + M : Matrix + The matrix representing the left hand side of the equation. + rhs : Matrix + The matrix representing the right hand side of the equation. + det_method : str or callable + The method to use to calculate the determinant of the matrix. + The default is ``'laplace'``. If a callable is passed, it should take a + single argument, the matrix, and return the determinant of the matrix. + + Returns + ======= + x : Matrix + The matrix that will satisfy ``Ax = B``. Will have as many rows as + matrix A has columns, and as many columns as matrix B. + + Examples + ======== + + >>> from sympy import Matrix + >>> A = Matrix([[0, -6, 1], [0, -6, -1], [-5, -2, 3]]) + >>> B = Matrix([[-30, -9], [-18, -27], [-26, 46]]) + >>> x = A.cramer_solve(B) + >>> x + Matrix([ + [ 0, -5], + [ 4, 3], + [-6, 9]]) + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Cramer%27s_rule#Explicit_formulas_for_small_systems + + """ + from .dense import zeros + + def entry(i, j): + return rhs[i, sol] if j == col else M[i, j] + + if det_method == "bird": + from .determinant import _det_bird + det = _det_bird + elif det_method == "laplace": + from .determinant import _det_laplace + det = _det_laplace + elif isinstance(det_method, str): + det = lambda matrix: matrix.det(method=det_method) + else: + det = det_method + det_M = det(M) + x = zeros(*rhs.shape) + for sol in range(rhs.shape[1]): + for col in range(rhs.shape[0]): + x[col, sol] = det(M.__class__(*M.shape, entry)) / det_M + return M.__class__(x) + + +def _solve(M, rhs, method='GJ'): + """Solves linear equation where the unique solution exists. + + Parameters + ========== + + rhs : Matrix + Vector representing the right hand side of the linear equation. + + method : string, optional + If set to ``'GJ'`` or ``'GE'``, the Gauss-Jordan elimination will be + used, which is implemented in the routine ``gauss_jordan_solve``. + + If set to ``'LU'``, ``LUsolve`` routine will be used. + + If set to ``'QR'``, ``QRsolve`` routine will be used. + + If set to ``'PINV'``, ``pinv_solve`` routine will be used. + + If set to ``'CRAMER'``, ``cramer_solve`` routine will be used. + + It also supports the methods available for special linear systems + + For positive definite systems: + + If set to ``'CH'``, ``cholesky_solve`` routine will be used. + + If set to ``'LDL'``, ``LDLsolve`` routine will be used. + + To use a different method and to compute the solution via the + inverse, use a method defined in the .inv() docstring. + + Returns + ======= + + solutions : Matrix + Vector representing the solution. + + Raises + ====== + + ValueError + If there is not a unique solution then a ``ValueError`` will be + raised. + + If ``M`` is not square, a ``ValueError`` and a different routine + for solving the system will be suggested. + """ + + if method in ('GJ', 'GE'): + try: + soln, param = M.gauss_jordan_solve(rhs) + + if param: + raise NonInvertibleMatrixError("Matrix det == 0; not invertible. " + "Try ``M.gauss_jordan_solve(rhs)`` to obtain a parametric solution.") + + except ValueError: + raise NonInvertibleMatrixError("Matrix det == 0; not invertible.") + + return soln + + elif method == 'LU': + return M.LUsolve(rhs) + elif method == 'CH': + return M.cholesky_solve(rhs) + elif method == 'QR': + return M.QRsolve(rhs) + elif method == 'LDL': + return M.LDLsolve(rhs) + elif method == 'PINV': + return M.pinv_solve(rhs) + elif method == 'CRAMER': + return M.cramer_solve(rhs) + else: + return M.inv(method=method).multiply(rhs) + + +def _solve_least_squares(M, rhs, method='CH'): + """Return the least-square fit to the data. + + Parameters + ========== + + rhs : Matrix + Vector representing the right hand side of the linear equation. + + method : string or boolean, optional + If set to ``'CH'``, ``cholesky_solve`` routine will be used. + + If set to ``'LDL'``, ``LDLsolve`` routine will be used. + + If set to ``'QR'``, ``QRsolve`` routine will be used. + + If set to ``'PINV'``, ``pinv_solve`` routine will be used. + + Otherwise, the conjugate of ``M`` will be used to create a system + of equations that is passed to ``solve`` along with the hint + defined by ``method``. + + Returns + ======= + + solutions : Matrix + Vector representing the solution. + + Examples + ======== + + >>> from sympy import Matrix, ones + >>> A = Matrix([1, 2, 3]) + >>> B = Matrix([2, 3, 4]) + >>> S = Matrix(A.row_join(B)) + >>> S + Matrix([ + [1, 2], + [2, 3], + [3, 4]]) + + If each line of S represent coefficients of Ax + By + and x and y are [2, 3] then S*xy is: + + >>> r = S*Matrix([2, 3]); r + Matrix([ + [ 8], + [13], + [18]]) + + But let's add 1 to the middle value and then solve for the + least-squares value of xy: + + >>> xy = S.solve_least_squares(Matrix([8, 14, 18])); xy + Matrix([ + [ 5/3], + [10/3]]) + + The error is given by S*xy - r: + + >>> S*xy - r + Matrix([ + [1/3], + [1/3], + [1/3]]) + >>> _.norm().n(2) + 0.58 + + If a different xy is used, the norm will be higher: + + >>> xy += ones(2, 1)/10 + >>> (S*xy - r).norm().n(2) + 1.5 + + """ + + if method == 'CH': + return M.cholesky_solve(rhs) + elif method == 'QR': + return M.QRsolve(rhs) + elif method == 'LDL': + return M.LDLsolve(rhs) + elif method == 'PINV': + return M.pinv_solve(rhs) + else: + t = M.H + return (t * M).solve(t * rhs, method=method) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/sparsetools.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/sparsetools.py new file mode 100644 index 0000000000000000000000000000000000000000..50048f6dc7e5cf160366963d16427987616ddce7 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/sparsetools.py @@ -0,0 +1,300 @@ +from sympy.core.containers import Dict +from sympy.core.symbol import Dummy +from sympy.utilities.iterables import is_sequence +from sympy.utilities.misc import as_int, filldedent + +from .sparse import MutableSparseMatrix as SparseMatrix + + +def _doktocsr(dok): + """Converts a sparse matrix to Compressed Sparse Row (CSR) format. + + Parameters + ========== + + A : contains non-zero elements sorted by key (row, column) + JA : JA[i] is the column corresponding to A[i] + IA : IA[i] contains the index in A for the first non-zero element + of row[i]. Thus IA[i+1] - IA[i] gives number of non-zero + elements row[i]. The length of IA is always 1 more than the + number of rows in the matrix. + + Examples + ======== + + >>> from sympy.matrices.sparsetools import _doktocsr + >>> from sympy import SparseMatrix, diag + >>> m = SparseMatrix(diag(1, 2, 3)) + >>> m[2, 0] = -1 + >>> _doktocsr(m) + [[1, 2, -1, 3], [0, 1, 0, 2], [0, 1, 2, 4], [3, 3]] + + """ + row, JA, A = [list(i) for i in zip(*dok.row_list())] + IA = [0]*((row[0] if row else 0) + 1) + for i, r in enumerate(row): + IA.extend([i]*(r - row[i - 1])) # if i = 0 nothing is extended + IA.extend([len(A)]*(dok.rows - len(IA) + 1)) + shape = [dok.rows, dok.cols] + return [A, JA, IA, shape] + + +def _csrtodok(csr): + """Converts a CSR representation to DOK representation. + + Examples + ======== + + >>> from sympy.matrices.sparsetools import _csrtodok + >>> _csrtodok([[5, 8, 3, 6], [0, 1, 2, 1], [0, 0, 2, 3, 4], [4, 3]]) + Matrix([ + [0, 0, 0], + [5, 8, 0], + [0, 0, 3], + [0, 6, 0]]) + + """ + smat = {} + A, JA, IA, shape = csr + for i in range(len(IA) - 1): + indices = slice(IA[i], IA[i + 1]) + for l, m in zip(A[indices], JA[indices]): + smat[i, m] = l + return SparseMatrix(*shape, smat) + + +def banded(*args, **kwargs): + """Returns a SparseMatrix from the given dictionary describing + the diagonals of the matrix. The keys are positive for upper + diagonals and negative for those below the main diagonal. The + values may be: + + * expressions or single-argument functions, + + * lists or tuples of values, + + * matrices + + Unless dimensions are given, the size of the returned matrix will + be large enough to contain the largest non-zero value provided. + + kwargs + ====== + + rows : rows of the resulting matrix; computed if + not given. + + cols : columns of the resulting matrix; computed if + not given. + + Examples + ======== + + >>> from sympy import banded, ones, Matrix + >>> from sympy.abc import x + + If explicit values are given in tuples, + the matrix will autosize to contain all values, otherwise + a single value is filled onto the entire diagonal: + + >>> banded({1: (1, 2, 3), -1: (4, 5, 6), 0: x}) + Matrix([ + [x, 1, 0, 0], + [4, x, 2, 0], + [0, 5, x, 3], + [0, 0, 6, x]]) + + A function accepting a single argument can be used to fill the + diagonal as a function of diagonal index (which starts at 0). + The size (or shape) of the matrix must be given to obtain more + than a 1x1 matrix: + + >>> s = lambda d: (1 + d)**2 + >>> banded(5, {0: s, 2: s, -2: 2}) + Matrix([ + [1, 0, 1, 0, 0], + [0, 4, 0, 4, 0], + [2, 0, 9, 0, 9], + [0, 2, 0, 16, 0], + [0, 0, 2, 0, 25]]) + + The diagonal of matrices placed on a diagonal will coincide + with the indicated diagonal: + + >>> vert = Matrix([1, 2, 3]) + >>> banded({0: vert}, cols=3) + Matrix([ + [1, 0, 0], + [2, 1, 0], + [3, 2, 1], + [0, 3, 2], + [0, 0, 3]]) + + >>> banded(4, {0: ones(2)}) + Matrix([ + [1, 1, 0, 0], + [1, 1, 0, 0], + [0, 0, 1, 1], + [0, 0, 1, 1]]) + + Errors are raised if the designated size will not hold + all values an integral number of times. Here, the rows + are designated as odd (but an even number is required to + hold the off-diagonal 2x2 ones): + + >>> banded({0: 2, 1: ones(2)}, rows=5) + Traceback (most recent call last): + ... + ValueError: + sequence does not fit an integral number of times in the matrix + + And here, an even number of rows is given...but the square + matrix has an even number of columns, too. As we saw + in the previous example, an odd number is required: + + >>> banded(4, {0: 2, 1: ones(2)}) # trying to make 4x4 and cols must be odd + Traceback (most recent call last): + ... + ValueError: + sequence does not fit an integral number of times in the matrix + + A way around having to count rows is to enclosing matrix elements + in a tuple and indicate the desired number of them to the right: + + >>> banded({0: 2, 2: (ones(2),)*3}) + Matrix([ + [2, 0, 1, 1, 0, 0, 0, 0], + [0, 2, 1, 1, 0, 0, 0, 0], + [0, 0, 2, 0, 1, 1, 0, 0], + [0, 0, 0, 2, 1, 1, 0, 0], + [0, 0, 0, 0, 2, 0, 1, 1], + [0, 0, 0, 0, 0, 2, 1, 1]]) + + An error will be raised if more than one value + is written to a given entry. Here, the ones overlap + with the main diagonal if they are placed on the + first diagonal: + + >>> banded({0: (2,)*5, 1: (ones(2),)*3}) + Traceback (most recent call last): + ... + ValueError: collision at (1, 1) + + By placing a 0 at the bottom left of the 2x2 matrix of + ones, the collision is avoided: + + >>> u2 = Matrix([ + ... [1, 1], + ... [0, 1]]) + >>> banded({0: [2]*5, 1: [u2]*3}) + Matrix([ + [2, 1, 1, 0, 0, 0, 0], + [0, 2, 1, 0, 0, 0, 0], + [0, 0, 2, 1, 1, 0, 0], + [0, 0, 0, 2, 1, 0, 0], + [0, 0, 0, 0, 2, 1, 1], + [0, 0, 0, 0, 0, 0, 1]]) + """ + try: + if len(args) not in (1, 2, 3): + raise TypeError + if not isinstance(args[-1], (dict, Dict)): + raise TypeError + if len(args) == 1: + rows = kwargs.get('rows', None) + cols = kwargs.get('cols', None) + if rows is not None: + rows = as_int(rows) + if cols is not None: + cols = as_int(cols) + elif len(args) == 2: + rows = cols = as_int(args[0]) + else: + rows, cols = map(as_int, args[:2]) + # fails with ValueError if any keys are not ints + _ = all(as_int(k) for k in args[-1]) + except (ValueError, TypeError): + raise TypeError(filldedent( + '''unrecognized input to banded: + expecting [[row,] col,] {int: value}''')) + def rc(d): + # return row,col coord of diagonal start + r = -d if d < 0 else 0 + c = 0 if r else d + return r, c + smat = {} + undone = [] + tba = Dummy() + # first handle objects with size + for d, v in args[-1].items(): + r, c = rc(d) + # note: only list and tuple are recognized since this + # will allow other Basic objects like Tuple + # into the matrix if so desired + if isinstance(v, (list, tuple)): + extra = 0 + for i, vi in enumerate(v): + i += extra + if is_sequence(vi): + vi = SparseMatrix(vi) + smat[r + i, c + i] = vi + extra += min(vi.shape) - 1 + else: + smat[r + i, c + i] = vi + elif is_sequence(v): + v = SparseMatrix(v) + rv, cv = v.shape + if rows and cols: + nr, xr = divmod(rows - r, rv) + nc, xc = divmod(cols - c, cv) + x = xr or xc + do = min(nr, nc) + elif rows: + do, x = divmod(rows - r, rv) + elif cols: + do, x = divmod(cols - c, cv) + else: + do = 1 + x = 0 + if x: + raise ValueError(filldedent(''' + sequence does not fit an integral number of times + in the matrix''')) + j = min(v.shape) + for i in range(do): + smat[r, c] = v + r += j + c += j + elif v: + smat[r, c] = tba + undone.append((d, v)) + s = SparseMatrix(None, smat) # to expand matrices + smat = s.todok() + # check for dim errors here + if rows is not None and rows < s.rows: + raise ValueError('Designated rows %s < needed %s' % (rows, s.rows)) + if cols is not None and cols < s.cols: + raise ValueError('Designated cols %s < needed %s' % (cols, s.cols)) + if rows is cols is None: + rows = s.rows + cols = s.cols + elif rows is not None and cols is None: + cols = max(rows, s.cols) + elif cols is not None and rows is None: + rows = max(cols, s.rows) + def update(i, j, v): + # update smat and make sure there are + # no collisions + if v: + if (i, j) in smat and smat[i, j] not in (tba, v): + raise ValueError('collision at %s' % ((i, j),)) + smat[i, j] = v + if undone: + for d, vi in undone: + r, c = rc(d) + v = vi if callable(vi) else lambda _: vi + i = 0 + while r + i < rows and c + i < cols: + update(r + i, c + i, v(i)) + i += 1 + return SparseMatrix(rows, cols, smat) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/utilities.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/utilities.py new file mode 100644 index 0000000000000000000000000000000000000000..b8a680b47e63615e210e561639a192ba47c642d3 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/sympy/matrices/utilities.py @@ -0,0 +1,72 @@ +from contextlib import contextmanager +from threading import local + +from sympy.core.function import expand_mul + + +class DotProdSimpState(local): + def __init__(self): + self.state = None + +_dotprodsimp_state = DotProdSimpState() + +@contextmanager +def dotprodsimp(x): + old = _dotprodsimp_state.state + + try: + _dotprodsimp_state.state = x + yield + finally: + _dotprodsimp_state.state = old + + +def _dotprodsimp(expr, withsimp=False): + """Wrapper for simplify.dotprodsimp to avoid circular imports.""" + from sympy.simplify.simplify import dotprodsimp as dps + return dps(expr, withsimp=withsimp) + + +def _get_intermediate_simp(deffunc=lambda x: x, offfunc=lambda x: x, + onfunc=_dotprodsimp, dotprodsimp=None): + """Support function for controlling intermediate simplification. Returns a + simplification function according to the global setting of dotprodsimp + operation. + + ``deffunc`` - Function to be used by default. + ``offfunc`` - Function to be used if dotprodsimp has been turned off. + ``onfunc`` - Function to be used if dotprodsimp has been turned on. + ``dotprodsimp`` - True, False or None. Will be overridden by global + _dotprodsimp_state.state if that is not None. + """ + + if dotprodsimp is False or _dotprodsimp_state.state is False: + return offfunc + if dotprodsimp is True or _dotprodsimp_state.state is True: + return onfunc + + return deffunc # None, None + + +def _get_intermediate_simp_bool(default=False, dotprodsimp=None): + """Same as ``_get_intermediate_simp`` but returns bools instead of functions + by default.""" + + return _get_intermediate_simp(default, False, True, dotprodsimp) + + +def _iszero(x): + """Returns True if x is zero.""" + return getattr(x, 'is_zero', None) + + +def _is_zero_after_expand_mul(x): + """Tests by expand_mul only, suitable for polynomials and rational + functions.""" + return expand_mul(x) == 0 + + +def _simplify(expr): + """ Wrapper to avoid circular imports. """ + from sympy.simplify.simplify import simplify + return simplify(expr) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/common/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/common/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..dfb6f8870e48aff3f0e195ba7820fa9d68235fb2 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/common/__init__.py @@ -0,0 +1,3 @@ +from .build import _build, cuda_include_dir, libcuda_dirs + +__all__ = ["_build", "libcuda_dirs", "cuda_include_dir"] diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/compiler/make_launcher.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/compiler/make_launcher.py new file mode 100644 index 0000000000000000000000000000000000000000..52a8f74a11eb34ef8a571775ddc03db6e39577b3 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/compiler/make_launcher.py @@ -0,0 +1,297 @@ +import hashlib +import os +import tempfile + +from ..common import _build +from ..common.backend import get_cuda_version_key +from ..common.build import is_hip +from ..runtime.cache import get_cache_manager +from .utils import generate_cu_signature + +# ----- stub -------- + + +def make_so_cache_key(version_hash, signature, constants, ids, **kwargs): + # Get unique key for the compiled code + signature = {k: 'ptr' if v[0] == '*' else v for k, v in signature.items()} + key = f"{version_hash}-{''.join(signature.values())}-{constants}-{ids}" + for kw in kwargs: + key = f"{key}-{kwargs.get(kw)}" + key = hashlib.md5(key.encode("utf-8")).hexdigest() + return key + + +def make_stub(name, signature, constants, ids, **kwargs): + # name of files that are cached + so_cache_key = make_so_cache_key(get_cuda_version_key(), signature, constants, ids, **kwargs) + so_cache_manager = get_cache_manager(so_cache_key) + so_name = f"{name}.so" + # retrieve stub from cache if it exists + cache_path = so_cache_manager.get_file(so_name) + if cache_path is None: + with tempfile.TemporaryDirectory() as tmpdir: + src = generate_launcher(constants, signature, ids) + src_path = os.path.join(tmpdir, "main.c") + with open(src_path, "w") as f: + f.write(src) + so = _build(name, src_path, tmpdir) + with open(so, "rb") as f: + return so_cache_manager.put(f.read(), so_name, binary=True) + else: + return cache_path + + +# ----- source code generation -------- + + +def ty_to_cpp(ty): + if ty[0] == '*': + return "hipDeviceptr_t" if is_hip() else "CUdeviceptr" + return { + "i1": "int32_t", + "i8": "int8_t", + "i16": "int16_t", + "i32": "int32_t", + "i64": "int64_t", + "u32": "uint32_t", + "u64": "uint64_t", + "fp16": "float", + "bf16": "float", + "fp32": "float", + "f32": "float", + "fp64": "double", + }[ty] + + +def generate_launcher(constants, signature, ids): + # Record the end of regular arguments; + # subsequent arguments are architecture-specific descriptors, such as tensor descriptors for CUDA. + signature, desc_start_idx = generate_cu_signature(constants, signature, ids) + arg_decls = ', '.join(f"{ty_to_cpp(ty)} arg{i}" for i, ty in signature.items()) + + def _extracted_type(ty): + if ty[0] == '*': + return "PyObject*" + return { + 'i1': 'int32_t', + 'i32': 'int32_t', + 'i64': 'int64_t', + 'u32': 'uint32_t', + 'u64': 'uint64_t', + 'fp16': 'float', + 'bf16': 'float', + 'fp32': 'float', + 'f32': 'float', + 'fp64': 'double', + }[ty] + + def format_of(ty): + return { + "PyObject*": "O", + "float": "f", + "double": "d", + "long": "l", + "uint32_t": "I", + "int32_t": "i", + "uint64_t": "K", + "int64_t": "L", + }[ty] + + format = "iiiiiiiiiKKOOO" + ''.join([format_of(_extracted_type(ty)) for ty in signature.values()]) + + # generate glue code + folded_without_constexprs = [c for c in ids['ids_of_folded_args'] if c not in ids['ids_of_const_exprs']] + params = [ + i for i in signature.keys() + if i >= desc_start_idx or (i not in constants and i not in folded_without_constexprs) + ] + src = f""" +#include \"cuda.h\" +#include +#include +#include + +static inline void gpuAssert(CUresult code, const char *file, int line) +{{ + if (code != CUDA_SUCCESS) + {{ + const char* prefix = "Triton Error [CUDA]: "; + const char* str; + cuGetErrorString(code, &str); + char err[1024] = {{0}}; + strcat(err, prefix); + strcat(err, str); + PyGILState_STATE gil_state; + gil_state = PyGILState_Ensure(); + PyErr_SetString(PyExc_RuntimeError, err); + PyGILState_Release(gil_state); + }} +}} + +#define CUDA_CHECK(ans) {{ gpuAssert((ans), __FILE__, __LINE__); }} + +typedef CUresult (*cuLaunchKernelEx_t)(const CUlaunchConfig* config, CUfunction f, void** kernelParams, void** extra); + +static cuLaunchKernelEx_t getLaunchKernelExHandle() {{ + // Open the shared library + void* handle = dlopen("libcuda.so", RTLD_LAZY); + if (!handle) {{ + PyErr_SetString(PyExc_RuntimeError, "Failed to open libcuda.so"); + return NULL; + }} + // Clear any existing error + dlerror(); + cuLaunchKernelEx_t cuLaunchKernelExHandle = (cuLaunchKernelEx_t)dlsym(handle, "cuLaunchKernelEx"); + // Check for errors + const char *dlsym_error = dlerror(); + if (dlsym_error) {{ + PyErr_SetString(PyExc_RuntimeError, "Failed to retrieve cuLaunchKernelEx from libcuda.so"); + return NULL; + }} + return cuLaunchKernelExHandle; +}} + +static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas, int clusterDimX, int clusterDimY, int clusterDimZ, int shared_memory, CUstream stream, CUfunction function{', ' + arg_decls if len(arg_decls) > 0 else ''}) {{ + void *params[] = {{ {', '.join(f"&arg{i}" for i in params)} }}; + if (gridX*gridY*gridZ > 0) {{ + if (num_ctas == 1) {{ + CUDA_CHECK(cuLaunchKernel(function, gridX, gridY, gridZ, 32*num_warps, 1, 1, shared_memory, stream, params, 0)); + }} else {{ + CUlaunchAttribute launchAttr[2]; + launchAttr[0].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION; + launchAttr[0].value.clusterDim.x = clusterDimX; + launchAttr[0].value.clusterDim.y = clusterDimY; + launchAttr[0].value.clusterDim.z = clusterDimZ; + launchAttr[1].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE; + launchAttr[1].value.clusterSchedulingPolicyPreference = CU_CLUSTER_SCHEDULING_POLICY_SPREAD; + CUlaunchConfig config; + config.gridDimX = gridX * clusterDimX; + config.gridDimY = gridY * clusterDimY; + config.gridDimZ = gridZ * clusterDimZ; + config.blockDimX = 32 * num_warps; + config.blockDimY = 1; + config.blockDimZ = 1; + config.sharedMemBytes = shared_memory; + config.hStream = stream; + config.attrs = launchAttr; + config.numAttrs = 2; + static cuLaunchKernelEx_t cuLaunchKernelExHandle = NULL; + if (cuLaunchKernelExHandle == NULL) {{ + cuLaunchKernelExHandle = getLaunchKernelExHandle(); + }} + CUDA_CHECK(cuLaunchKernelExHandle(&config, function, params, 0)); + }} + }} +}} + +typedef struct _DevicePtrInfo {{ + CUdeviceptr dev_ptr; + bool valid; +}} DevicePtrInfo; + +static inline DevicePtrInfo getPointer(PyObject *obj, int idx) {{ + DevicePtrInfo ptr_info; + ptr_info.dev_ptr = 0; + ptr_info.valid = true; + if (PyLong_Check(obj)) {{ + ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(obj); + return ptr_info; + }} + if (obj == Py_None) {{ + // valid nullptr + return ptr_info; + }} + PyObject *ptr = PyObject_GetAttrString(obj, "data_ptr"); + if(ptr){{ + PyObject *empty_tuple = PyTuple_New(0); + PyObject *ret = PyObject_Call(ptr, empty_tuple, NULL); + Py_DECREF(empty_tuple); + Py_DECREF(ptr); + if (!PyLong_Check(ret)) {{ + PyErr_SetString(PyExc_TypeError, "data_ptr method of Pointer object must return 64-bit int"); + ptr_info.valid = false; + return ptr_info; + }} + ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(ret); + if(!ptr_info.dev_ptr) + return ptr_info; + uint64_t dev_ptr; + int status = cuPointerGetAttribute(&dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, ptr_info.dev_ptr); + if (status == CUDA_ERROR_INVALID_VALUE) {{ + PyErr_Format(PyExc_ValueError, + "Pointer argument (at %d) cannot be accessed from Triton (cpu tensor?)", idx); + ptr_info.valid = false; + }} + ptr_info.dev_ptr = dev_ptr; + Py_DECREF(ret); // Thanks ChatGPT! + return ptr_info; + }} + PyErr_SetString(PyExc_TypeError, "Pointer argument must be either uint64 or have data_ptr method"); + ptr_info.valid = false; + return ptr_info; +}} + +static PyObject* launch(PyObject* self, PyObject* args) {{ + int gridX, gridY, gridZ; + uint64_t _stream; + uint64_t _function; + int num_warps; + int num_ctas; + int clusterDimX; + int clusterDimY; + int clusterDimZ; + int shared_memory; + PyObject *launch_enter_hook = NULL; + PyObject *launch_exit_hook = NULL; + PyObject *compiled_kernel = NULL; + {' '.join([f"{_extracted_type(ty)} _arg{i}; " for i, ty in signature.items()])} + if(!PyArg_ParseTuple(args, \"{format}\", &gridX, &gridY, &gridZ, &num_warps, &num_ctas, &clusterDimX, &clusterDimY, &clusterDimZ, &shared_memory, &_stream, &_function, &launch_enter_hook, &launch_exit_hook, &compiled_kernel{', ' + ', '.join(f"&_arg{i}" for i, ty in signature.items()) if len(signature) > 0 else ''})) {{ + return NULL; + }} + + if (launch_enter_hook != Py_None && !PyObject_CallObject(launch_enter_hook, args)) {{ + return NULL; + }} + + + // raise exception asap + {"; ".join([f"DevicePtrInfo ptr_info{i} = getPointer(_arg{i}, {i}); if (!ptr_info{i}.valid) return NULL;" if ty[0] == "*" else "" for i, ty in signature.items()])}; + Py_BEGIN_ALLOW_THREADS; + _launch(gridX, gridY, gridZ, num_warps, num_ctas, clusterDimX, clusterDimY, clusterDimZ, shared_memory, (CUstream)_stream, (CUfunction)_function{', ' + ', '.join(f"ptr_info{i}.dev_ptr" if ty[0]=="*" else f"_arg{i}"for i, ty in signature.items()) if len(signature) > 0 else ''}); + Py_END_ALLOW_THREADS; + if (PyErr_Occurred()) {{ + return NULL; + }} + + if (launch_exit_hook != Py_None && !PyObject_CallObject(launch_exit_hook, args)) {{ + return NULL; + }} + + // return None + Py_INCREF(Py_None); + return Py_None; +}} + +static PyMethodDef ModuleMethods[] = {{ + {{"launch", launch, METH_VARARGS, "Entry point for all kernels with this signature"}}, + {{NULL, NULL, 0, NULL}} // sentinel +}}; + +static struct PyModuleDef ModuleDef = {{ + PyModuleDef_HEAD_INIT, + \"__triton_launcher\", + NULL, //documentation + -1, //size + ModuleMethods +}}; + +PyMODINIT_FUNC PyInit___triton_launcher(void) {{ + PyObject *m = PyModule_Create(&ModuleDef); + if(m == NULL) {{ + return NULL; + }} + PyModule_AddFunctions(m, ModuleMethods); + return m; +}} +""" + return src diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/language/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/language/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8137e1fad4276fa87a06f8f6ff01d25075557607 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/language/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/language/__pycache__/core.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/language/__pycache__/core.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f0df6581071476e45dcf4daa9757697439eebb78 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/language/__pycache__/core.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/language/__pycache__/math.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/language/__pycache__/math.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..213668a1483d9d839e2a5d4c02eaa2cfa9ce5d53 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/language/__pycache__/math.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/language/extra/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/language/extra/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2fd0ff3eeee37a0d4f7d44ef36f66b41895ff09f --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/language/extra/__init__.py @@ -0,0 +1,3 @@ +from . import cuda + +__all__ = ['cuda'] diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/language/extra/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/language/extra/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9cc7125c00304cf344e3de696a885bde7d24aff0 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/language/extra/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/language/semantic.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/language/semantic.py new file mode 100644 index 0000000000000000000000000000000000000000..c1ee1036ba6f8097546cfbb9c5076b59183981d9 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/language/semantic.py @@ -0,0 +1,1549 @@ +from __future__ import annotations # remove after python 3.11 + +from functools import wraps +from typing import List, Optional, Sequence, Tuple, TypeVar + +from .._C.libtriton.triton import ir +from ..common.build import is_hip +from . import core as tl + +T = TypeVar('T') + + +class IncompatibleTypeErrorImpl(Exception): + + def __init__(self, type_a, type_b): + self.type_a = type_a + self.type_b = type_b + self.message = "invalid operands of type " + self.type_a.__repr__() + " and " + self.type_b.__repr__() + super(IncompatibleTypeErrorImpl, self).__init__(self.message) + + +# ===----------------------------------------------------------------------===## +# Programming Model +# ===----------------------------------------------------------------------===## + + +def program_id(axis: int, builder: ir.builder) -> tl.tensor: + if axis not in (0, 1, 2): + raise ValueError(f"program_id axis must be 0, 1, or 2 but got {axis}") + return tl.tensor(builder.create_get_program_id(axis), tl.int32) + + +def num_programs(axis: int, builder: ir.builder) -> tl.tensor: + if axis not in (0, 1, 2): + raise ValueError(f"num_programs axis must be 0, 1, or 2 but got {axis}") + return tl.tensor(builder.create_get_num_programs(axis), tl.int32) + + +# ===----------------------------------------------------------------------===// +# Implicit Casting Utilities +# ===----------------------------------------------------------------------===// + + +def integer_promote_impl(a_ty: tl.dtype, b_ty: tl.dtype) -> tl.dtype: + a_rank = a_ty.int_bitwidth + b_rank = b_ty.int_bitwidth + a_sn = a_ty.int_signedness + b_sn = b_ty.int_signedness + # Rules for signedness taken from "Usual arithmetic conversions" on + # https://en.cppreference.com/w/c/language/conversion. + if a_sn == b_sn: + return a_ty if a_rank > b_rank else b_ty + elif a_sn == tl.dtype.SIGNEDNESS.UNSIGNED: + return a_ty if a_rank >= b_rank else b_ty + elif b_sn == tl.dtype.SIGNEDNESS.UNSIGNED: + return b_ty if b_rank >= a_rank else a_ty + assert False + + +def computation_type_impl(a_ty: tl.dtype, b_ty: tl.dtype, div_or_mod: bool) -> tl.dtype: + # 1) if one operand is double, the other is implicitly + # converted to double + if a_ty.is_fp64() or b_ty.is_fp64(): + return tl.float64 + # 2) if one operand is float, the other is implicitly + # converted to float + if a_ty.is_fp32() or b_ty.is_fp32(): + return tl.float32 + # 3 ) if one operand is half, the other is implicitly converted to half + # unless we're doing / or %, which do not exist natively in PTX for fp16. + # Supported PTX op: add, sub, mul, fma, neg, abs, min, max, tanh, ex2, setp + if a_ty.is_fp16() or b_ty.is_fp16(): + if div_or_mod: + return tl.float32 + else: + return tl.float16 + # 4) return bf16 only if both operands are of bf16 + if a_ty.is_bf16() or b_ty.is_bf16(): + if div_or_mod: + return tl.float32 + if a_ty.is_bf16() and b_ty.is_bf16(): + return tl.bfloat16 + return tl.float32 + if not a_ty.is_int() or not b_ty.is_int(): + assert False + # 5 ) both operands are integer and undergo + # integer promotion + if div_or_mod and a_ty.int_signedness != b_ty.int_signedness: + raise ValueError("Cannot use /, #, or % with " + a_ty.__repr__() + " and " + b_ty.__repr__() + + " because they have different signedness;" + "this is unlikely to result in a useful answer. Cast them to the same signedness.") + return integer_promote_impl(a_ty, b_ty) + + +# ===----------------------------------------------------------------------===// +# Binary Operators +# ===----------------------------------------------------------------------===// + + +def check_ptr_type_impl(type_a: tl.dtype, type_b: tl.dtype, allow_ptr_a: bool) -> None: + if type_a.is_ptr(): + if not allow_ptr_a: + raise IncompatibleTypeErrorImpl(type_a, type_b) + # T* + U* with T != U + if type_b.is_ptr() and (type_a != type_b): + raise IncompatibleTypeErrorImpl(type_a, type_b) + # T* + float + if type_b.is_floating(): + raise IncompatibleTypeErrorImpl(type_a, type_b) + + +def binary_op_type_checking_impl(lhs: tl.tensor, rhs: tl.tensor, builder: ir.builder, allow_lhs_ptr=False, + allow_rhs_ptr=False, arithmetic_check=True, + div_or_mod=False) -> Tuple[tl.tensor, tl.tensor]: + # implicit broadcasting + lhs, rhs = broadcast_impl_value(lhs, rhs, builder) + # implicit typecasting + lhs_sca_ty = lhs.type.scalar + rhs_sca_ty = rhs.type.scalar + check_ptr_type_impl(lhs_sca_ty, rhs_sca_ty, allow_lhs_ptr) + check_ptr_type_impl(rhs_sca_ty, lhs_sca_ty, allow_rhs_ptr) + if arithmetic_check and not lhs_sca_ty.is_ptr() and not rhs_sca_ty.is_ptr(): + ret_sca_ty = computation_type_impl(lhs_sca_ty, rhs_sca_ty, div_or_mod) + lhs = cast(lhs, ret_sca_ty, builder) + rhs = cast(rhs, ret_sca_ty, builder) + return lhs, rhs + + +def add(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: + input, other = binary_op_type_checking_impl(input, other, builder, True, True) + input_scalar_ty = input.type.scalar + other_scalar_ty = other.type.scalar + if input_scalar_ty.is_ptr() and other_scalar_ty.is_ptr(): + raise ValueError("cannot add pointers together") + + # offset + ptr + # ptr + offset + if other_scalar_ty.is_ptr() and not input_scalar_ty.is_ptr(): + input, other = other, input + input_scalar_ty = input.type.scalar + other_scalar_ty = other.type.scalar + if input_scalar_ty.is_ptr(): + return tl.tensor(builder.create_addptr(input.handle, other.handle), input.type) + # float + float + elif input_scalar_ty.is_floating(): + return tl.tensor(builder.create_fadd(input.handle, other.handle), input.type) + # int + int + elif input_scalar_ty.is_int(): + return tl.tensor(builder.create_add(input.handle, other.handle), input.type) + assert False + + +def sub(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: + input, other = binary_op_type_checking_impl(input, other, builder, True, False) + scalar_ty = input.type.scalar + # ptr - offset + if scalar_ty.is_ptr(): + return tl.tensor(builder.create_addptr(input.handle, minus(other, builder).handle), input.type) + # float - float + if scalar_ty.is_floating(): + return tl.tensor(builder.create_fsub(input.handle, other.handle), input.type) + # int - int + elif scalar_ty.is_int(): + return tl.tensor(builder.create_sub(input.handle, other.handle), input.type) + assert False + + +def mul(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: + input, other = binary_op_type_checking_impl(input, other, builder) + scalar_ty = input.type.scalar + # float * float + if scalar_ty.is_floating(): + return tl.tensor(builder.create_fmul(input.handle, other.handle), input.type) + # * int + elif scalar_ty.is_int(): + return tl.tensor(builder.create_mul(input.handle, other.handle), input.type) + assert False + + +def truediv(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: + input, other = binary_op_type_checking_impl(input, other, builder, False, False, True, True) + input_scalar_ty = input.type.scalar + other_scalar_ty = other.type.scalar + # float / int + if input_scalar_ty.is_floating() and other_scalar_ty.is_int(): + other = cast(other, input_scalar_ty, builder) + # int / float + elif input_scalar_ty.is_int() and other_scalar_ty.is_floating(): + input = cast(input, other_scalar_ty, builder) + # int / int (cast to tl.float32) + elif input_scalar_ty.is_int() and other_scalar_ty.is_int(): + input = cast(input, tl.float32, builder) + other = cast(other, tl.float32, builder) + # float / float (cast to the highest exponent type) + elif input_scalar_ty.is_floating() and other_scalar_ty.is_floating(): + if input_scalar_ty.fp_mantissa_width > other_scalar_ty.fp_mantissa_width: + other = cast(other, input_scalar_ty, builder) + else: + input = cast(input, other_scalar_ty, builder) + # unreachable + else: + assert False + return tl.tensor(builder.create_fdiv(input.handle, other.handle), input.type) + + +def floordiv(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: + input, other = binary_op_type_checking_impl(input, other, builder, False, False, True, True) + input_scalar_ty = input.type.scalar + other_scalar_ty = other.type.scalar + if input_scalar_ty.is_int() and other_scalar_ty.is_int(): + ret_ty = integer_promote_impl(input_scalar_ty, other_scalar_ty) + input = cast(input, ret_ty, builder) + other = cast(other, ret_ty, builder) + if ret_ty.is_int_signed(): + return tl.tensor(builder.create_sdiv(input.handle, other.handle), input.type) + else: + return tl.tensor(builder.create_udiv(input.handle, other.handle), input.type) + assert False + + +def fdiv(input: tl.tensor, other: tl.tensor, ieee_rounding: bool, builder: ir.builder) -> tl.tensor: + input_scalar_ty = input.type.scalar + other_scalar_ty = other.type.scalar + if not input_scalar_ty.is_floating() or not other_scalar_ty.is_floating(): + raise ValueError("both operands of fdiv must have floating scalar type") + input, other = binary_op_type_checking_impl(input, other, builder, False, False, False, True) + ret = builder.create_fdiv(input.handle, other.handle) + return tl.tensor(ret, input.type) + + +def mod(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: + input, other = binary_op_type_checking_impl(input, other, builder, False, False, True, True) + scalar_ty = input.type.scalar + other_scalar_ty = other.type.scalar + # float % float + if scalar_ty.is_floating(): + # input - input.div(other, rounding_mode="floor") * other + ret = sub(input, mul(floor(fdiv(input, other, False, builder), builder), other, builder), builder) + return ret + # % int + elif scalar_ty.is_int(): + if scalar_ty.int_signedness != other_scalar_ty.int_signedness: + raise ValueError("Cannot mod " + scalar_ty.__repr__() + " by " + other_scalar_ty.__repr__() + " " + "because they have different signedness;" + "this is unlikely to result in a useful answer. Cast them to the same signedness.") + if scalar_ty.is_int_signed(): + return tl.tensor(builder.create_srem(input.handle, other.handle), input.type) + else: + return tl.tensor(builder.create_urem(input.handle, other.handle), input.type) + assert False + + +############## +# bitwise ops +############## + + +def bitwise_op_type_checking_impl(input: tl.tensor, other: tl.tensor, + builder: ir.builder) -> Tuple[tl.tensor, tl.tensor]: + input, other = binary_op_type_checking_impl(input, other, builder, False, False, False) + input_sca_ty = input.type.scalar + other_sca_ty = other.type.scalar + if not input_sca_ty.is_int() or not other_sca_ty.is_int(): + raise IncompatibleTypeErrorImpl(input_sca_ty, other_sca_ty) + ret_sca_ty = integer_promote_impl(input_sca_ty, other_sca_ty) + if ret_sca_ty != input_sca_ty: + input = cast(input, ret_sca_ty, builder) + if ret_sca_ty != other_sca_ty: + other = cast(other, ret_sca_ty, builder) + return input, other + + +def and_(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: + input, other = bitwise_op_type_checking_impl(input, other, builder) + return tl.tensor(builder.create_and(input.handle, other.handle), input.type) + + +def or_(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: + input, other = bitwise_op_type_checking_impl(input, other, builder) + return tl.tensor(builder.create_or(input.handle, other.handle), input.type) + + +def xor_(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: + input, other = bitwise_op_type_checking_impl(input, other, builder) + return tl.tensor(builder.create_xor(input.handle, other.handle), input.type) + + +def logical_and(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: + if not input.type.is_int1(): + input = bitcast(input, tl.dtype("int1"), builder) + if not other.type.is_int1(): + other = bitcast(other, tl.dtype("int1"), builder) + return and_(input, other, builder) + + +def logical_or(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: + if not input.type.is_int1(): + input = bitcast(input, tl.dtype("int1"), builder) + if not other.type.is_int1(): + other = bitcast(other, tl.dtype("int1"), builder) + return or_(input, other, builder) + + +def not_(input: tl.tensor, builder: ir.builder): + if not input.type.is_int1(): + input = bitcast(input, tl.dtype("int1"), builder) + return invert(input, builder) + + +def lshr(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: + input, other = bitwise_op_type_checking_impl(input, other, builder) + return tl.tensor(builder.create_lshr(input.handle, other.handle), input.type) + + +def ashr(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: + input, other = bitwise_op_type_checking_impl(input, other, builder) + return tl.tensor(builder.create_ashr(input.handle, other.handle), input.type) + + +def shl(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: + input, other = bitwise_op_type_checking_impl(input, other, builder) + return tl.tensor(builder.create_shl(input.handle, other.handle), input.type) + + +# ===----------------------------------------------------------------------===// +# Unary Operators +# ===----------------------------------------------------------------------===// + + +def plus(input: tl.tensor) -> tl.tensor: + return input + + +def minus(input: tl.tensor, builder: ir.builder) -> tl.tensor: + input_sca_ty = input.type.scalar + if input_sca_ty.is_ptr(): + raise ValueError("wrong type argument to unary minus (" + input_sca_ty.__repr__() + ")") + _0 = tl.tensor(builder.get_null_value(input_sca_ty.to_ir(builder)), input_sca_ty) + return sub(_0, input, builder) + + +def invert(input: tl.tensor, builder: tl.tensor) -> tl.tensor: + input_sca_ty = input.type.scalar + if input_sca_ty.is_ptr() or input_sca_ty.is_floating(): + raise ValueError("wrong type argument to unary invert (" + input_sca_ty.__repr__() + ")") + _1 = tl.tensor(builder.get_all_ones_value(input_sca_ty.to_ir(builder)), input_sca_ty) + return xor_(input, _1, builder) + + +# ===----------------------------------------------------------------------===// +# Comparison Operators +# ===----------------------------------------------------------------------===// +def _bool_like(v: tl.tensor) -> tl.block_type: + if not v.type.is_block(): + return tl.int1 + shape = v.type.shape + return tl.block_type(tl.int1, shape) + + +def greater_than(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: + input, other = binary_op_type_checking_impl(input, other, builder) + scalar_ty = input.type.scalar + # float > float + if scalar_ty.is_floating(): + return tl.tensor(builder.create_fcmpOGT(input.handle, other.handle), _bool_like(input)) + # > int + elif scalar_ty.is_int(): + if scalar_ty.is_int_signed(): + return tl.tensor(builder.create_icmpSGT(input.handle, other.handle), _bool_like(input)) + else: + return tl.tensor(builder.create_icmpUGT(input.handle, other.handle), _bool_like(input)) + assert False + + +def greater_equal(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: + input, other = binary_op_type_checking_impl(input, other, builder) + scalar_ty = input.type.scalar + # float >= float + if scalar_ty.is_floating(): + return tl.tensor(builder.create_fcmpOGE(input.handle, other.handle), _bool_like(input)) + # >= int + elif scalar_ty.is_int(): + if scalar_ty.is_int_signed(): + return tl.tensor(builder.create_icmpSGE(input.handle, other.handle), _bool_like(input)) + else: + return tl.tensor(builder.create_icmpUGE(input.handle, other.handle), _bool_like(input)) + assert False + + +def less_than(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: + input, other = binary_op_type_checking_impl(input, other, builder) + scalar_ty = input.type.scalar + # float < float + if scalar_ty.is_floating(): + return tl.tensor(builder.create_fcmpOLT(input.handle, other.handle), _bool_like(input)) + # < int + elif scalar_ty.is_int(): + if scalar_ty.is_int_signed(): + return tl.tensor(builder.create_icmpSLT(input.handle, other.handle), _bool_like(input)) + else: + return tl.tensor(builder.create_icmpULT(input.handle, other.handle), _bool_like(input)) + assert False + + +def less_equal(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: + input, other = binary_op_type_checking_impl(input, other, builder) + scalar_ty = input.type.scalar + # float < float + if scalar_ty.is_floating(): + return tl.tensor(builder.create_fcmpOLE(input.handle, other.handle), _bool_like(input)) + # < int + elif scalar_ty.is_int(): + if scalar_ty.is_int_signed(): + return tl.tensor(builder.create_icmpSLE(input.handle, other.handle), _bool_like(input)) + else: + return tl.tensor(builder.create_icmpULE(input.handle, other.handle), _bool_like(input)) + assert False + + +def equal(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: + input, other = binary_op_type_checking_impl(input, other, builder) + scalar_ty = input.type.scalar + # float == float + if scalar_ty.is_floating(): + return tl.tensor(builder.create_fcmpOEQ(input.handle, other.handle), _bool_like(input)) + # == int + elif scalar_ty.is_int(): + return tl.tensor(builder.create_icmpEQ(input.handle, other.handle), _bool_like(input)) + assert False + + +def not_equal(input: tl.tensor, other: tl.tensor, builder: ir.builder) -> tl.tensor: + input, other = binary_op_type_checking_impl(input, other, builder) + scalar_ty = input.type.scalar + # float == float + if scalar_ty.is_floating(): + return tl.tensor(builder.create_fcmpUNE(input.handle, other.handle), _bool_like(input)) + # == int + elif scalar_ty.is_int(): + return tl.tensor(builder.create_icmpNE(input.handle, other.handle), _bool_like(input)) + assert False + + +# ===----------------------------------------------------------------------===// +# Block Creation +# ===----------------------------------------------------------------------===// + + +def arange(start: int, end: int, builder: ir.builder) -> tl.tensor: + if not isinstance(start, int) or not isinstance(end, int): + raise ValueError("arange's arguments must be of type tl.constexpr") + is_start_int64 = bool(start >> 32) + is_end_int64 = bool(end >> 32) + if is_start_int64 or is_end_int64: + raise ValueError("arange must fit in int32") + if end <= start: + raise ValueError("arange's end argument must be greater than the start argument") + + shape = [end - start] + ret_ty = tl.block_type(tl.int32, shape) + return tl.tensor(builder.create_make_range(start, end), ret_ty) + + +def full(shape: List[int], value, dtype: tl.dtype, builder: ir.builder) -> tl.tensor: + if isinstance(value, tl.tensor): + assert value.numel.value == 1, "only accepts size-1 tensor" + value = cast(value, dtype, builder) + else: + # scalar + if dtype is None: + raise ValueError("dtype must be specified when value is not a tensor") + if value == 0: + value = builder.get_null_value(dtype.to_ir(builder)) + else: + get_value_fn = getattr(builder, f"get_{dtype.name}") + value = get_value_fn(value) + value = tl.tensor(value, dtype) + + return splat(value, shape, builder) + + +# ===----------------------------------------------------------------------===// +# Shape Manipulation +# ===----------------------------------------------------------------------===// + + +def splat(value: tl.tensor, shape: List[int], builder: ir.builder) -> tl.tensor: + assert not value.type.is_block(), "Cannot splat a block tensor" + if len(shape) == 0: + return value + ret_ty = tl.block_type(value.dtype, shape) + return tl.tensor(builder.create_splat(value.handle, shape), ret_ty) + + +def view(input: tl.tensor, dst_shape: List[int], builder: ir.builder) -> tl.tensor: + numel = 1 + for s in dst_shape: + numel *= s + if input.type.numel != numel: + raise ValueError("cannot view block of different shape") + ret_ty = tl.block_type(input.type.scalar, dst_shape) + return tl.tensor(builder.create_reshape(input.handle, dst_shape, True), ret_ty) + + +def reshape(input: tl.tensor, dst_shape: List[int], builder: ir.builder) -> tl.tensor: + ret_ty = tl.block_type(input.type.scalar, dst_shape) + return tl.tensor(builder.create_reshape(input.handle, dst_shape, False), ret_ty) + + +def expand_dims(input: tl.tensor, axis: int, builder: ir.builder) -> tl.tensor: + dst_shape = [tl._constexpr_to_value(x) for x in input.shape] + dst_shape.insert(axis, 1) + + if not input.type.is_block(): + return splat(input, shape=dst_shape, builder=builder) + + ret_ty = tl.block_type(input.type.scalar, dst_shape) + return tl.tensor(builder.create_expand_dims(input.handle, axis), ret_ty) + + +def cat(lhs: tl.tensor, rhs: tl.tensor, can_reorder: bool, builder: ir.builder) -> tl.tensor: + assert can_reorder, "current implementation of `cat` always may reorder elements" + assert len(lhs.shape) == 1 + ret_type = tl.block_type(lhs.type.scalar, [lhs.shape[0] + rhs.shape[0]]) + return tl.tensor(builder.create_cat(lhs.handle, rhs.handle), ret_type) + + +def trans(input: tl.tensor, builder: ir.builder) -> tl.tensor: + if len(input.shape) != 2: + raise ValueError("Only 2D tensors can be transposed") + ret_type = tl.block_type(input.type.scalar, [input.shape[1], input.shape[0]]) + return tl.tensor(builder.create_trans(input.handle), ret_type) + + +def broadcast_impl_shape(input: tl.tensor, shape: List[int], builder: ir.builder) -> tl.tensor: + if not input.type.is_block(): + ret_ty = tl.block_type(input.type, shape) + return tl.tensor(builder.create_splat(input.handle, shape), ret_ty) + src_shape = input.type.get_block_shapes() + if len(src_shape) != len(shape): + raise ValueError(f"Cannot broadcast, rank mismatch: {src_shape}, {shape}") + if shape == src_shape: + return input + for i, item in enumerate(src_shape): + if shape[i] != item and item != 1: + raise ValueError(f"Cannot broadcast, the expanded size of the tensor ({shape[i]})" + f" must match the existing size ({item}) at non-singleton dimension" + f" {i}: {src_shape}, {shape}") + ret_ty = tl.block_type(input.type.scalar, shape) + return tl.tensor(builder.create_broadcast(input.handle, shape), ret_ty) + + +def broadcast_impl_value(lhs: tl.tensor, rhs: tl.tensor, builder: ir.builder) -> tl.tensor: + lhs_ty = lhs.type + rhs_ty = rhs.type + + # make_shape_compatible(block, scalar) + if lhs_ty.is_block() and not rhs_ty.is_block(): + rhs_ty = tl.block_type(rhs_ty.scalar, lhs_ty.shape) + rhs = tl.tensor(builder.create_splat(rhs.handle, lhs_ty.get_block_shapes()), rhs_ty) + # make_shape_compatible(scalar, block) + elif not lhs_ty.is_block() and rhs_ty.is_block(): + lhs_ty = tl.block_type(lhs_ty.scalar, rhs_ty.shape) + lhs = tl.tensor(builder.create_splat(lhs.handle, rhs_ty.get_block_shapes()), lhs_ty) + # make_shape_compatible(block, block) + elif lhs_ty.is_block() and rhs_ty.is_block(): + lhs_shape = lhs_ty.get_block_shapes() + rhs_shape = rhs_ty.get_block_shapes() + + if len(lhs_shape) < len(rhs_shape): + # Add new axes to lhs + for dim in range(len(lhs_shape), len(rhs_shape)): + lhs = tl.tensor(builder.create_expand_dims(lhs.handle, 0), + tl.block_type(lhs_ty.scalar, [1] + lhs_shape)) + lhs_ty = lhs.type + lhs_shape = lhs_ty.get_block_shapes() + elif len(rhs_shape) < len(lhs_shape): + # Add new axes to rhs + for dim in range(len(rhs_shape), len(lhs_shape)): + rhs = tl.tensor(builder.create_expand_dims(rhs.handle, 0), + tl.block_type(rhs_ty.scalar, [1] + rhs_shape)) + rhs_ty = rhs.type + rhs_shape = rhs_ty.get_block_shapes() + assert len(rhs_shape) == len(lhs_shape) + + ret_shape = [] + for i, left in enumerate(lhs_shape): + right = rhs_shape[i] + if left == 1: + ret_shape.append(right) + elif right == 1: + ret_shape.append(left) + elif left == right: + ret_shape.append(left) + else: + raise ValueError("Cannot make_shape_compatible: incompatible dimensions " + "at index " + str(i) + ": " + str(left) + " and " + str(right)) + if lhs_shape != ret_shape: + ret_ty = tl.block_type(lhs_ty.scalar, ret_shape) + lhs = tl.tensor(builder.create_broadcast(lhs.handle, ret_shape), ret_ty) + if rhs_shape != ret_shape: + ret_ty = tl.block_type(rhs_ty.scalar, ret_shape) + rhs = tl.tensor(builder.create_broadcast(rhs.handle, ret_shape), ret_ty) + # (scalar, scalar) => returns original blocks + return lhs, rhs + + +####### +# cast +####### + + +def bitcast(input: tl.tensor, dst_ty: tl.dtype, builder: ir.builder) -> tl.tensor: + src_ty = input.type + if src_ty.is_block(): + dst_ty = tl.block_type(dst_ty.scalar, input.type.get_block_shapes()) + if src_ty == dst_ty: + return input + src_sca_ty = src_ty.scalar + dst_sca_ty = dst_ty.scalar + if src_sca_ty.is_ptr() or dst_sca_ty.is_ptr(): + return cast(input, dst_ty, builder) + # Bitcast + src_bits = src_sca_ty.primitive_bitwidth + dst_bits = dst_sca_ty.primitive_bitwidth + if src_bits != dst_bits: + raise ValueError("Cannot bitcast data-type of size " + str(src_bits) + " to " + "data-type of size " + str(dst_bits)) + return tl.tensor(builder.create_bitcast(input.handle, dst_ty.to_ir(builder)), dst_ty) + + +def cast(input: tl.tensor, dst_ty: tl.dtype, builder: ir.builder) -> tl.tensor: + src_ty = input.type + if isinstance(dst_ty, tl.constexpr): + dst_ty = dst_ty.value + if src_ty.is_block(): + dst_ty = tl.block_type(dst_ty.scalar, input.type.get_block_shapes()) + if src_ty == dst_ty: + return input + + src_sca_ty = src_ty.scalar + dst_sca_ty = dst_ty.scalar + + if (src_sca_ty.is_fp8e4nv() or dst_sca_ty.is_fp8e4nv()): + assert builder.options.allow_fp8e4nv, "fp8e4nv data type is not supported on CUDA arch < 89" + + # Casting with customized floating types involved: fp8 <=> bf16, fp16, fp32, fp64 + if (src_sca_ty.is_fp8() and dst_sca_ty.is_floating()) or \ + (src_sca_ty.is_floating() and dst_sca_ty.is_fp8()): + return tl.tensor(builder.create_fp_to_fp(input.handle, dst_ty.to_ir(builder)), dst_ty) + + # bf16 <=> (not fp32) + if (src_sca_ty.is_fp16() and not dst_sca_ty.is_fp32()) or \ + (src_sca_ty.is_bf16() and not dst_sca_ty.is_fp32()): + return cast(cast(input, tl.float32, builder), dst_sca_ty, builder) + + # Standard floating types' casting: truncation + # fp64 => fp32, fp16, bf16 + # fp32 => fp16, bf16 + truncate_fp = src_sca_ty.is_floating() and \ + dst_sca_ty.is_floating() and \ + src_sca_ty.primitive_bitwidth > dst_sca_ty.primitive_bitwidth + if truncate_fp: + return tl.tensor(builder.create_fp_trunc(input.handle, dst_ty.to_ir(builder)), dst_ty) + + # Standard floating types' casting: extension + # fp32 => fp64 + # fp16 => fp32, fp64 + # bf16 => fp32, fp64 + ext_fp = src_sca_ty.is_floating() and \ + dst_sca_ty.is_floating() and \ + src_sca_ty.primitive_bitwidth < dst_sca_ty.primitive_bitwidth + if ext_fp: + return tl.tensor(builder.create_fp_ext(input.handle, dst_ty.to_ir(builder)), dst_ty) + + # Casting between integer types + if src_sca_ty.is_int() and dst_sca_ty.is_int() and \ + (src_sca_ty.int_bitwidth != dst_sca_ty.int_bitwidth or src_sca_ty.int_signedness != dst_sca_ty.int_signedness): + sign_extend = src_sca_ty.is_int_signed() and not src_sca_ty.is_bool() + if dst_sca_ty.is_bool(): + ty = input.dtype.to_ir(builder) + _0 = tl.tensor(builder.get_null_value(ty), input.dtype) + return not_equal(input, _0, builder) + else: + return tl.tensor(builder.create_int_cast(input.handle, dst_ty.to_ir(builder), sign_extend), dst_ty) + + # Casting standard floating types to integer types + if src_sca_ty.is_standard_floating() and dst_sca_ty.is_int(): + if dst_sca_ty.is_bool(): + ty = input.dtype.to_ir(builder) + _0 = tl.tensor(builder.get_null_value(ty), input.dtype) + return not_equal(input, _0, builder) + elif dst_sca_ty.is_int_signed(): + return tl.tensor(builder.create_fp_to_si(input.handle, dst_ty.to_ir(builder)), dst_ty) + else: + return tl.tensor(builder.create_fp_to_ui(input.handle, dst_ty.to_ir(builder)), dst_ty) + + # Casting integer types to standard floating types + if src_sca_ty.is_int() and dst_sca_ty.is_standard_floating(): + if src_sca_ty.is_bool() or not src_sca_ty.is_int_signed(): + return tl.tensor(builder.create_ui_to_fp(input.handle, dst_ty.to_ir(builder)), dst_ty) + else: + return tl.tensor(builder.create_si_to_fp(input.handle, dst_ty.to_ir(builder)), dst_ty) + + # Casting pointer types to integer types + if src_sca_ty.is_ptr() and dst_sca_ty.is_int(): + bitwidth = dst_sca_ty.int_bitwidth + if bitwidth == 64: + return tl.tensor(builder.create_ptr_to_int(input.handle, dst_ty.to_ir(builder)), dst_ty) + if bitwidth == 1: + return not_equal(cast(input, tl.int64, builder), tl.tensor(builder.get_int64(0), tl.int64), builder) + + # Casting integer types to pointer types + if src_sca_ty.is_int() and dst_sca_ty.is_ptr(): + return tl.tensor(builder.create_int_to_ptr(input.handle, dst_ty.to_ir(builder)), dst_ty) + + # Casting pointer types to pointer types + if src_sca_ty.is_ptr() and dst_sca_ty.is_ptr(): + return tl.tensor(builder.create_bitcast(input.handle, dst_ty.to_ir(builder)), dst_ty) + + assert False, f'cannot cast {input} to {dst_ty}' + + +# ===----------------------------------------------------------------------===// +# Memory Operators +# ===----------------------------------------------------------------------===// + + +def _str_to_load_cache_modifier(cache_modifier): + cache = ir.CACHE_MODIFIER.NONE # default + if cache_modifier: + if cache_modifier == ".ca": + cache = ir.CACHE_MODIFIER.CA + elif cache_modifier == ".cg": + cache = ir.CACHE_MODIFIER.CG + else: + raise ValueError(f"Cache modifier {cache_modifier} not supported") + return cache + + +def _str_to_store_cache_modifier(cache_modifier): + cache = ir.CACHE_MODIFIER.NONE # default + if cache_modifier: + if cache_modifier == ".wb": + cache = ir.CACHE_MODIFIER.WB + elif cache_modifier == ".cg": + cache = ir.CACHE_MODIFIER.CG + elif cache_modifier == ".cs": + cache = ir.CACHE_MODIFIER.CS + elif cache_modifier == ".wt": + cache = ir.CACHE_MODIFIER.WT + else: + raise ValueError(f"Cache modifier {cache_modifier} not supported") + return cache + + +def _str_to_eviction_policy(eviction_policy): + eviction = ir.EVICTION_POLICY.NORMAL # default + if eviction_policy: + if eviction_policy == "evict_last": + eviction = ir.EVICTION_POLICY.EVICT_LAST + elif eviction_policy == "evict_first": + eviction = ir.EVICTION_POLICY.EVICT_FIRST + else: + raise ValueError(f"Eviction policy {eviction_policy} not supported") + return eviction + + +def _str_to_padding_option(padding_option): + padding = None # default + if padding_option: + if padding_option == "zero": + padding = ir.PADDING_OPTION.PAD_ZERO + elif padding_option == "nan": + padding = ir.PADDING_OPTION.PAD_NAN + else: + raise ValueError(f"Padding option {padding_option} not supported") + return padding + + +def _str_to_sem(sem_option): + sem = ir.MEM_SEMANTIC.ACQUIRE_RELEASE + if sem_option: + if sem_option == "acquire": + sem = ir.MEM_SEMANTIC.ACQUIRE + elif sem_option == "release": + sem = ir.MEM_SEMANTIC.RELEASE + elif sem_option == "acq_rel": + sem = ir.MEM_SEMANTIC.ACQUIRE_RELEASE + elif sem_option == "relaxed": + sem = ir.MEM_SEMANTIC.RELAXED + else: + raise ValueError(f"Memory semantic {sem_option} not supported") + return sem + + +def _str_to_scope(scope_option): + scope = ir.MEM_SYNC_SCOPE.GPU + if scope_option: + if scope_option == "gpu": + scope = ir.MEM_SYNC_SCOPE.GPU + elif scope_option == "cta": + scope = ir.MEM_SYNC_SCOPE.CTA + elif scope_option == "sys": + scope = ir.MEM_SYNC_SCOPE.SYSTEM + else: + raise ValueError(f"Memory semantic {scope_option} not supported") + return scope + + +def _canonicalize_boundary_check(boundary_check, block_shape): + if boundary_check: + if not hasattr(boundary_check, "__iter__"): + boundary_check = [boundary_check] + boundary_check = [elem.value if isinstance(elem, tl.constexpr) else elem for elem in boundary_check] + for dim in boundary_check: + assert isinstance(dim, int) and 0 <= dim < len(block_shape) + assert len(boundary_check) > 0 + assert len(boundary_check) == len(set(boundary_check)), "Duplicate dimension in `boundary_check`" + return sorted(boundary_check) + return tuple() + + +def _load_block_pointer(ptr, mask, other, boundary_check, padding, cache, eviction, is_volatile, builder): + # Load by a block pointer: `pointer_type>` + # Block pointer can not have `mask` and `other` arguments + if mask or other: + raise ValueError("`mask` and `other` arguments cannot be specified for loading block pointers") + + elt_ty = ptr.type.element_ty.element_ty + assert elt_ty != tl.int1, "`tl.int1` should be rewrited in `tl.make_block_ptr`" + if elt_ty.is_int() and padding == ir.PADDING_OPTION.PAD_NAN: + raise ValueError("Padding option `nan` is not supported for integer block pointers") + + # `dst_ty` is de-referenced type of the pointer type + dst_ty = ptr.type.element_ty + + # Check `boundary_check` argument + boundary_check = _canonicalize_boundary_check(boundary_check, dst_ty.get_block_shapes()) + + # Build IR + return tl.tensor( + builder.create_tensor_pointer_load(ptr.handle, boundary_check, padding, cache, eviction, is_volatile), dst_ty) + + +def _load_legacy(ptr, mask, other, boundary_check, padding, cache, eviction, is_volatile, builder): + # Load by a tensor of pointers or a pointer of scalar: `block_type>` or `pointer_type<>` + if not ptr.type.scalar.is_ptr(): + raise ValueError(f"Unsupported ptr type {ptr.type.__repr__()} in `tl.load`") + + # Check `mask`, `other`, `boundary_check`, and `padding` arguments + if not mask and other: + raise ValueError("`other` cannot be provided without `mask`") + if padding or boundary_check: + raise ValueError("`padding_option` or `boundary_check` argument is not supported for loading a tensor of" + "pointers or loading a scalar. Because the compiler does not know the boundary; please " + "use block pointers (defined by `make_block_ptr`) instead") + + # For a pointer of scalar, check the type of `mask` and `other` + if not ptr.type.is_block(): + if mask and mask.type.is_block(): + raise ValueError("Mask argument cannot be block type if pointer argument is not a block") + if other and other.type.is_block(): + raise ValueError("Other argument cannot be block type if pointer argument is not a block") + + # Make `mask` and `other` into the same shape as `ptr` + if ptr.type.is_block(): + if mask: + mask = broadcast_impl_shape(mask, ptr.type.get_block_shapes(), builder) + if other: + other = broadcast_impl_shape(other, ptr.type.get_block_shapes(), builder) + + # Get `pointer_type` and `elt_ty` + ptr_ty = ptr.type.scalar + elt_ty = ptr_ty.element_ty + + # Treat `pointer_type` as `pointer_type` + if elt_ty == tl.int1: + elt_ty = tl.int8 + ptr_ty = tl.pointer_type(elt_ty, ptr_ty.address_space) + ptr = cast(ptr, ptr_ty, builder) + + # Cast `other` into `ele_ty` type + if other: + other = cast(other, elt_ty, builder) + + # Create loaded result type `dst_ty` + if ptr.type.is_block(): + shape = ptr.type.get_block_shapes() + dst_ty = tl.block_type(elt_ty, shape) + else: + # Load by de-referencing the pointer of scalar + dst_ty = elt_ty + + # Build IR + if not mask: + return tl.tensor(builder.create_load(ptr.handle, cache, eviction, is_volatile), dst_ty) + else: + return tl.tensor( + builder.create_masked_load(ptr.handle, mask.handle, other.handle if other else None, cache, eviction, + is_volatile), dst_ty) + + +def load(ptr: tl.tensor, mask: Optional[tl.tensor], other: Optional[tl.tensor], boundary_check, padding_option: str, + cache_modifier: str, eviction_policy: str, is_volatile: bool, builder: ir.builder) -> tl.tensor: + # Cache, eviction and padding options + cache = _str_to_load_cache_modifier(cache_modifier) + eviction = _str_to_eviction_policy(eviction_policy) + padding = _str_to_padding_option(padding_option) + + if ptr.type.is_ptr() and ptr.type.element_ty.is_block(): + # Load by a block pointer: `pointer_type>` + return _load_block_pointer(ptr, mask, other, boundary_check, padding, cache, eviction, is_volatile, builder) + else: + # Load by a tensor of pointers or a pointer of scalar: `block_type>` or `pointer_type<>` + return _load_legacy(ptr, mask, other, boundary_check, padding, cache, eviction, is_volatile, builder) + + +def _store_block_pointer(ptr, val, mask, boundary_check, cache, eviction, builder): + # Store by a block pointer: `pointer_type>` + # Block pointers can not have the `mask` argument + if mask: + raise ValueError("`mask` and `other` arguments cannot be specified for loading block pointers") + + # Check same shape and element type + block_shape = ptr.type.element_ty.get_block_shapes() + if not val.type.is_block(): + val = broadcast_impl_shape(val, block_shape, builder) + assert val.type.is_block(), "Value argument must be block type or a scalar" + assert block_shape == val.type.get_block_shapes( + ), f"Block shape({block_shape}) and value shape({val.type.get_block_shapes()}) mismatch" + assert ptr.type.element_ty.element_ty == val.type.element_ty, f"Block element type({ptr.type.element_ty.element_ty}) and value element type({val.type.element_ty}) mismatch" + + elt_ty = ptr.type.element_ty.element_ty + assert elt_ty != tl.int1, "`tl.int1` should be rewrited in `tl.make_block_ptr`" + + # Check `boundary_check` argument + boundary_check = _canonicalize_boundary_check(boundary_check, block_shape) + + # Build IR + return tl.tensor(builder.create_tensor_pointer_store(ptr.handle, val.handle, boundary_check, cache, eviction), + tl.void) + + +def _store_legacy(ptr, val, mask, boundary_check, cache, eviction, builder): + # Store by a tensor of pointers or a pointer of scalar: `block_type>` or `pointer_type<>` + if not ptr.type.scalar.is_ptr(): + raise ValueError(f"Unsupported ptr type {ptr.type.__repr__()} in `tl.store`") + + # Check `boundary_check` argument + if boundary_check: + raise ValueError("`boundary_check` argument is not supported for storing a tensor of pointers or storing a " + "scalar. Because the compiler does not know the boundary; please use block pointers " + "(defined by `make_block_ptr`) instead") + + # For a pointer of scalar, check the type of `val` and `mask` + if not ptr.type.is_block(): + if val.type.is_block(): + raise ValueError("Value argument cannot be block type if pointer argument is not a block") + if mask and mask.type.is_block(): + raise ValueError("Mask argument cannot be block type if pointer argument is not a block") + + # Make `mask` and `val` into the same shape as `ptr` + if ptr.type.is_block(): + val = broadcast_impl_shape(val, ptr.type.get_block_shapes(), builder) + if mask: + mask = broadcast_impl_shape(mask, ptr.type.get_block_shapes(), builder) + + ptr_ty = ptr.type.scalar + elt_ty = ptr_ty.element_ty + + # Treat `pointer_type` as `pointer_type` + if elt_ty == tl.int1: + elt_ty = tl.int8 + ptr_ty = tl.pointer_type(elt_ty, ptr_ty.address_space) + ptr = cast(ptr, ptr_ty, builder) + + # Cast to target data type + val = cast(val, elt_ty, builder) + + # Build IR + if not mask: + return tl.tensor(builder.create_store(ptr.handle, val.handle, cache, eviction), tl.void) + if not mask.type.scalar.is_bool(): + raise ValueError("Mask must have boolean scalar type") + return tl.tensor(builder.create_masked_store(ptr.handle, val.handle, mask.handle, cache, eviction), tl.void) + + +def store(ptr: tl.tensor, val: tl.tensor, mask: Optional[tl.tensor], boundary_check, cache_modifier: str, + eviction_policy: str, builder: ir.builder) -> tl.tensor: + # Cache and eviction options + cache = _str_to_store_cache_modifier(cache_modifier) + eviction = _str_to_eviction_policy(eviction_policy) + + if ptr.type.is_ptr() and ptr.type.element_ty.is_block(): + # Store by a block pointer: `pointer_type>` + return _store_block_pointer(ptr, val, mask, boundary_check, cache, eviction, builder) + else: + # Store by a tensor of pointers or a pointer of scalar: `block_type>` or `pointer_type<>` + return _store_legacy(ptr, val, mask, boundary_check, cache, eviction, builder) + + +######### +# atomic +######### + + +def atomic_cas(ptr: tl.tensor, cmp: tl.tensor, val: tl.tensor, sem: str, scope: str, builder: ir.builder) -> tl.tensor: + sem = _str_to_sem(sem) + scope = _str_to_scope(scope) + element_ty = ptr.type.scalar.element_ty + if element_ty.primitive_bitwidth not in [16, 32, 64]: + raise ValueError("atomic_cas only supports elements with width {16, 32, 64}") + return tl.tensor(builder.create_atomic_cas(ptr.handle, cmp.handle, val.handle, sem, scope), val.type) + + +def atom_red_typechecking_impl(ptr: tl.tensor, val: tl.tensor, mask: tl.tensor, op: str, + builder: ir.builder) -> Tuple[tl.tensor, tl.tensor, tl.tensor]: + if not ptr.type.scalar.is_ptr(): + raise ValueError("Pointer argument of store instruction is " + ptr.type.__repr__()) + element_ty = ptr.type.scalar.element_ty + if element_ty is tl.float16 and op != 'add': + raise ValueError("atomic_" + op + " does not support fp16") + if element_ty in [tl.int1, tl.int8, tl.int16, tl.bfloat16]: + raise ValueError("atomic_" + op + " does not support " + str(element_ty)) + if ptr.type.is_block(): + if mask: + mask = broadcast_impl_shape(mask, ptr.type.get_block_shapes(), builder) + if val: + val = broadcast_impl_shape(val, ptr.type.get_block_shapes(), builder) + val = cast(val, ptr.type.scalar.element_ty, builder) + if not mask: + mask_ir = builder.get_int1(True) + mask_ty = tl.int1 + if ptr.type.is_block(): + mask_ir = builder.create_splat(mask_ir, ptr.type.get_block_shapes()) + mask_ty = tl.block_type(tl.int1, ptr.type.get_block_shapes()) + mask = tl.tensor(mask_ir, mask_ty) + return ptr, val, mask + + +def atomic_max(ptr: tl.tensor, val: tl.tensor, mask: tl.tensor, sem: str, scope: str, builder: ir.builder) -> tl.tensor: + ptr, val, mask = atom_red_typechecking_impl(ptr, val, mask, 'max', builder) + sem = _str_to_sem(sem) + scope = _str_to_scope(scope) + sca_ty = val.type.scalar + # direct call to atomic_max for integers + if sca_ty.is_int(): + if sca_ty.is_int_signed(): + return tl.tensor( + builder.create_atomic_rmw(ir.ATOMIC_OP.MAX, ptr.handle, val.handle, mask.handle, sem, scope), val.type) + else: + return tl.tensor( + builder.create_atomic_rmw(ir.ATOMIC_OP.UMAX, ptr.handle, val.handle, mask.handle, sem, scope), val.type) + # for float + # return atomic_smax(i_ptr, i_val) if val >= 0 + # return atomic_umin(i_ptr, i_val) if val < 0 + if sca_ty not in {tl.float32, tl.float64}: + raise TypeError(f"atomic_max not supported for dtype {sca_ty}") + + itype = tl.int32 if sca_ty == tl.float32 else tl.float64 + zero = full([], 0.0, sca_ty, builder) + + i_val = bitcast(val, itype, builder) + i_ptr = bitcast(ptr, tl.pointer_type(itype, 1), builder) + pos = greater_equal(val, zero, builder) + neg = less_than(val, zero, builder) + pos_ret = tl.tensor( + builder.create_atomic_rmw(ir.ATOMIC_OP.MAX, i_ptr.handle, i_val.handle, + and_(mask, pos, builder).handle, sem, scope), i_val.type) + neg_ret = tl.tensor( + builder.create_atomic_rmw(ir.ATOMIC_OP.UMIN, i_ptr.handle, i_val.handle, + and_(mask, neg, builder).handle, sem, scope), i_val.type) + ret = where(pos, pos_ret, neg_ret, builder) + return bitcast(ret, sca_ty, builder) + + +def atomic_min(ptr: tl.tensor, val: tl.tensor, mask: tl.tensor, sem: str, scope: str, builder: ir.builder) -> tl.tensor: + ptr, val, mask = atom_red_typechecking_impl(ptr, val, mask, 'min', builder) + sem = _str_to_sem(sem) + scope = _str_to_scope(scope) + sca_ty = val.type.scalar + # direct call to atomic_min for integers + if sca_ty.is_int(): + if sca_ty.is_int_signed(): + return tl.tensor( + builder.create_atomic_rmw(ir.ATOMIC_OP.MIN, ptr.handle, val.handle, mask.handle, sem, scope), val.type) + else: + return tl.tensor( + builder.create_atomic_rmw(ir.ATOMIC_OP.UMIN, ptr.handle, val.handle, mask.handle, sem, scope), val.type) + # for float + # return atomic_smin(i_ptr, i_val) if val >= 0 + # return atomic_umax(i_ptr, i_val) if val < 0 + if sca_ty not in {tl.float32, tl.float64}: + raise TypeError(f"atomic_min not supported for dtype {sca_ty}") + + itype = tl.int32 if sca_ty == tl.float32 else tl.float64 + zero = full([], 0.0, sca_ty, builder) + + i_val = bitcast(val, itype, builder) + i_ptr = bitcast(ptr, tl.pointer_type(itype, 1), builder) + pos = greater_equal(val, zero, builder) + neg = less_than(val, zero, builder) + pos_ret = tl.tensor( + builder.create_atomic_rmw(ir.ATOMIC_OP.MIN, i_ptr.handle, i_val.handle, + and_(mask, pos, builder).handle, sem, scope), i_val.type) + neg_ret = tl.tensor( + builder.create_atomic_rmw(ir.ATOMIC_OP.UMAX, i_ptr.handle, i_val.handle, + and_(mask, neg, builder).handle, sem, scope), i_val.type) + ret = where(pos, pos_ret, neg_ret, builder) + return bitcast(ret, sca_ty, builder) + + +def atomic_add(ptr: tl.tensor, val: tl.tensor, mask: tl.tensor, sem: str, scope: str, builder: ir.builder) -> tl.tensor: + ptr, val, mask = atom_red_typechecking_impl(ptr, val, mask, 'add', builder) + sem = _str_to_sem(sem) + scope = _str_to_scope(scope) + sca_ty = val.type.scalar + op = ir.ATOMIC_OP.FADD if sca_ty.is_floating() else ir.ATOMIC_OP.ADD + return tl.tensor(builder.create_atomic_rmw(op, ptr.handle, val.handle, mask.handle, sem, scope), val.type) + + +def atomic_and(ptr: tl.tensor, val: tl.tensor, mask: tl.tensor, sem: str, scope: str, builder: ir.builder) -> tl.tensor: + ptr, val, mask = atom_red_typechecking_impl(ptr, val, mask, 'and', builder) + sem = _str_to_sem(sem) + scope = _str_to_scope(scope) + return tl.tensor(builder.create_atomic_rmw(ir.ATOMIC_OP.AND, ptr.handle, val.handle, mask.handle, sem, scope), + val.type) + + +def atomic_or(ptr: tl.tensor, val: tl.tensor, mask: tl.tensor, sem: str, scope: str, builder: ir.builder) -> tl.tensor: + ptr, val, mask = atom_red_typechecking_impl(ptr, val, mask, 'or', builder) + sem = _str_to_sem(sem) + scope = _str_to_scope(scope) + return tl.tensor(builder.create_atomic_rmw(ir.ATOMIC_OP.OR, ptr.handle, val.handle, mask.handle, sem, scope), + val.type) + + +def atomic_xor(ptr: tl.tensor, val: tl.tensor, mask: tl.tensor, sem: str, scope: str, builder: ir.builder) -> tl.tensor: + ptr, val, mask = atom_red_typechecking_impl(ptr, val, mask, 'xor', builder) + sem = _str_to_sem(sem) + scope = _str_to_scope(scope) + return tl.tensor(builder.create_atomic_rmw(ir.ATOMIC_OP.XOR, ptr.handle, val.handle, mask.handle, sem, scope), + val.type) + + +def atomic_xchg(ptr: tl.tensor, val: tl.tensor, mask: tl.tensor, sem: str, scope: str, + builder: ir.builder) -> tl.tensor: + ptr, val, mask = atom_red_typechecking_impl(ptr, val, mask, 'xchg', builder) + sem = _str_to_sem(sem) + scope = _str_to_scope(scope) + return tl.tensor(builder.create_atomic_rmw(ir.ATOMIC_OP.XCHG, ptr.handle, val.handle, mask.handle, sem, scope), + val.type) + + +# ===----------------------------------------------------------------------===// +# Linear Algebra +# ===----------------------------------------------------------------------===// + + +def gpu_has_mfma() -> bool: + if not is_hip(): + return False + return True # mfma supported in ['gfx908', 'gfx90a'] + + +def mfma_supported(M, N, K, allow_tf32, ret_scalar_ty) -> bool: + if not gpu_has_mfma(): + return False + # TODO: Add check for configurations and types. + return True + + +def dot(lhs: tl.tensor, rhs: tl.tensor, acc: tl.tensor, allow_tf32: bool, max_num_imprecise_acc: int, + out_dtype: tl.dtype, builder: ir.builder) -> tl.tensor: + + def assert_dtypes_valid(lhs_dtype, rhs_dtype, options): + if not options.allow_fp8e4nv: + assert not lhs_dtype.is_fp8e4nv() and not rhs_dtype.is_fp8e4nv( + ), "Dot op does not support fp8e4nv on CUDA arch < 90" + if lhs_dtype.is_fp8() and rhs_dtype.is_fp8(): + return + assert lhs_dtype == rhs_dtype, f"First input ({lhs_dtype}) and second input ({rhs_dtype}) must have the same dtype!" + else: + assert not lhs_dtype.is_fp8e4b15() and not rhs_dtype.is_fp8e4b15( + ), "Dot op does not support fp8e4b15 on CUDA arch >= 90" + assert not lhs_dtype.is_fp8e4b15x4() and not rhs_dtype.is_fp8e4b15x4( + ), "Dot op does not support fp8e4b15x4 on CUDA arch >= 90" + if lhs_dtype.is_int() or rhs_dtype.is_int(): + assert lhs_dtype == rhs_dtype, f"Both operands must be same type. First operand ({lhs_dtype}) and second operand ({rhs_dtype})" + assert lhs_dtype.is_int8() or lhs_dtype.is_uint8( + ), f"Both operands must be either int8 or uint8. Operand type ({lhs_dtype})" + elif lhs_dtype.is_fp8() or rhs_dtype.is_fp8(): + assert lhs_dtype.is_fp8e4nv() or lhs_dtype.is_fp8e5( + ), f"Only supports fp8e4nv or fp8e5. First operand ({lhs_dtype})" + assert rhs_dtype.is_fp8e4nv() or rhs_dtype.is_fp8e5( + ), f"Only supports fp8e4nv or fp8e5. Second operand ({rhs_dtype})" + else: + assert lhs_dtype.is_fp16() or lhs_dtype.is_bf16() or lhs_dtype.is_fp32() or lhs_dtype.is_int1( + ), f"Unsupported dtype {lhs_dtype}" + assert rhs_dtype.is_fp16() or rhs_dtype.is_bf16() or rhs_dtype.is_fp32() or rhs_dtype.is_int1( + ), f"Unsupported dtype {rhs_dtype}" + assert lhs_dtype == rhs_dtype, f"First input ({lhs_dtype}) and second input ({rhs_dtype}) must have the same dtype!" + + assert lhs.type.is_block() and rhs.type.is_block() + + assert_dtypes_valid(lhs.dtype, rhs.dtype, builder.options) + + assert len(lhs.shape) == 2, f"First input shape ({lhs.shape}) is not two dimensional!" + assert len(rhs.shape) == 2, f"Second input shape ({rhs.shape}) is not two dimensional!" + assert lhs.shape[1].value == rhs.shape[ + 0].value, f"First input shape ({lhs.shape}) and second input shape {rhs.shape} are not compatible for matmul (second index of first shape ({lhs.shape[1].value}) must be equal to first index of second shape ({rhs.shape[0].value})" + assert lhs.shape[0].value >= 16 and lhs.shape[1].value >= 16 \ + and rhs.shape[1].value >= 16, \ + f"All values in both first input shape ({lhs.shape}) and second input shape ({rhs.shape}) must be >= 16!" + if lhs.type.scalar.is_int(): + assert lhs.type.scalar == tl.int8, "only int8 supported!" + # TODO: This is CUDA specific, check if ROCm has the same limitation + assert lhs.shape[1].value >= 32, "small blocks not supported!" + _0 = builder.get_int32(0) + ret_scalar_ty = tl.int32 + elif out_dtype.is_bf16(): + raise ValueError( + "out_dtype=bfloat16 is unsupported. Please use out_dtype=float32/float16 and cast with `.to(tl.bfloat16)`") + elif lhs.type.scalar.is_fp32() or lhs.type.scalar.is_bf16(): + _0 = builder.get_fp32(0) + ret_scalar_ty = tl.float32 + else: + _0 = builder.get_fp16(0) if out_dtype.is_fp16() else builder.get_fp32(0) + ret_scalar_ty = out_dtype + + M = lhs.type.shape[0] + N = rhs.type.shape[1] + + # Cast operands of types f16 and i8 for configurations where FMA only supported. + if is_hip() and not mfma_supported(M, N, lhs.type.shape[1], allow_tf32, ret_scalar_ty): + ret_cast_scalar_ty = tl.float32 if lhs.type.scalar.is_int() else ret_scalar_ty + lhs = cast(lhs, ret_cast_scalar_ty, builder) + rhs = cast(rhs, ret_cast_scalar_ty, builder) + if ret_cast_scalar_ty == tl.float16: + _0 = builder.create_splat(builder.get_fp16(0), [M, N]) + else: + _0 = builder.create_splat(builder.get_fp32(0), [M, N]) + ret_ty = tl.block_type(ret_cast_scalar_ty, [M, N]) + ret = tl.tensor(builder.create_dot(lhs.handle, rhs.handle, _0, allow_tf32), ret_ty) + return cast(ret, ret_scalar_ty, builder) + if is_hip() and mfma_supported(M, N, lhs.type.shape[1], allow_tf32, + ret_scalar_ty) and ret_scalar_ty.primitive_bitwidth < 32: + if lhs.type.scalar.is_int(): + ret_dot_scalar_ty = tl.int32 + _0 = builder.create_splat(builder.get_int32(0), [M, N]) + else: + ret_dot_scalar_ty = tl.float32 + _0 = builder.create_splat(builder.get_fp32(0), [M, N]) + ret_ty = tl.block_type(ret_dot_scalar_ty, [M, N]) + ret = tl.tensor(builder.create_dot(lhs.handle, rhs.handle, _0, allow_tf32), ret_ty) + return cast(ret, ret_scalar_ty, builder) + ret_ty = tl.block_type(ret_scalar_ty, [M, N]) + if acc is None: + acc_handle = builder.create_splat(_0, [M, N]) + else: + acc_handle = acc.handle + assert acc.type == ret_ty + + # max_num_imprecise_acc only applies to fp8 -> fp32 dot on sm_90 + max_num_imprecise_acc = 0 + if lhs.dtype.is_fp8() and rhs.dtype.is_fp8(): + max_num_imprecise_acc = builder.options.max_num_imprecise_acc_default + if max_num_imprecise_acc is None: + max_num_imprecise_acc = 2**30 + + return tl.tensor(builder.create_dot(lhs.handle, rhs.handle, acc_handle, allow_tf32, max_num_imprecise_acc), ret_ty) + + +# ===----------------------------------------------------------------------===// +# Indexing +# ===----------------------------------------------------------------------===// + + +def where(condition: tl.tensor, x: tl.tensor, y: tl.tensor, builder: ir.builder) -> tl.tensor: + condition = cast(condition, tl.int1, builder) + if condition.type.is_block(): + condition, x = broadcast_impl_value(condition, x, builder) + x, y = broadcast_impl_value(x, y, builder) + condition, x = broadcast_impl_value(condition, x, builder) + + x, y = binary_op_type_checking_impl(x, y, builder, True, True) + if not condition.type.is_block(): + condition, _ = broadcast_impl_value(condition, x, builder) + ret_ty = x.type + return tl.tensor(builder.create_select(condition.handle, x.handle, y.handle), ret_ty) + + +# ===----------------------------------------------------------------------===// +# Reduction +# ===----------------------------------------------------------------------=== + + +def reduction(inputs: Sequence[tl.tensor], axis: int, region_builder_fn, builder: ir.builder) -> Tuple[tl.tensor, ...]: + if axis is None: + new_inputs = [] + for i in range(len(inputs)): + new_shape = [inputs[i].numel.value] + new_inputs.append(view(inputs[i], new_shape, builder)) + inputs = tuple(new_inputs) + axis = 0 + # get result shape + shape = inputs[0].type.shape + ret_shape = [s for i, s in enumerate(shape) if i != axis] + for t in inputs: + assert t.type.shape == shape + + def wrap_tensor(x, scalar_ty): + if ret_shape: + res_ty = tl.block_type(scalar_ty, ret_shape) + else: + # 0d-tensor -> scalar + res_ty = scalar_ty + return tl.tensor(x, res_ty) + + reduce_op = builder.create_reduce([t.handle for t in inputs], axis) + region_builder_fn(reduce_op) + reduce_op.verify() + + return tuple(wrap_tensor(reduce_op.get_result(i), inputs[i].type.scalar) for i in range(len(inputs))) + + +# ===----------------------------------------------------------------------=== +# Associative Scan +# ===----------------------------------------------------------------------=== + + +def associative_scan(inputs: Sequence[tl.tensor], axis: int, region_builder_fn, + builder: ir.builder) -> Tuple[tl.tensor, ...]: + if len(inputs) != 1: + raise ValueError("Current implementation only support single tensor input") + shape = inputs[0].type.shape + + def wrap_tensor(x, scalar_ty): + res_ty = tl.block_type(scalar_ty, shape) + return tl.tensor(x, res_ty) + + scan_op = builder.create_scan([t.handle for t in inputs], axis) + region_builder_fn(scan_op) + scan_op.verify() + + return tuple(wrap_tensor(scan_op.get_result(i), inputs[i].type.scalar) for i in range(len(inputs))) + + +# ===----------------------------------------------------------------------=== +# Math +# ===----------------------------------------------------------------------=== + + +def _check_dtype(dtypes: List[str]) -> T: + """ + We're following libdevice's convention to check accepted data types for math functions. + It is not a good practice to support all data types as accelerators/GPUs don't support + many float16 and bfloat16 math operations. + We should let the users know that they are using and invoke explicit cast to convert + the data type to the supported one. + """ + + def wrapper(fn): + + @wraps(fn) + def check(*args, **kwargs): + # concatenate args and kwargs + all_args = list(args) + list(kwargs.values()) + for arg in [a for a in all_args if isinstance(a, tl.tensor)]: + if arg.type.scalar.name not in dtypes: + raise ValueError(f"Expected dtype {dtypes} but got {arg.type.scalar.name}") + return fn(*args, **kwargs) + + return check + + return wrapper + + +def umulhi(x: tl.tensor, y: tl.tensor, builder: ir.builder) -> tl.tensor: + x, y = binary_op_type_checking_impl(x, y, builder) + # FIXME(Keren): not portable, should be fixed + from . import math + return math.mulhi(x, y, _builder=builder) + + +@_check_dtype(dtypes=["fp32", "fp64"]) +def floor(x: tl.tensor, builder: ir.builder) -> tl.tensor: + # FIXME(Keren): not portable, should be fixed + from . import math + return math.floor(x, _builder=builder) + + +@_check_dtype(dtypes=["fp32", "fp64"]) +def exp(x: tl.tensor, builder: ir.builder) -> tl.tensor: + return tl.tensor(builder.create_exp(x.handle), x.type) + + +@_check_dtype(dtypes=["fp32", "fp64"]) +def log(x: tl.tensor, builder: ir.builder) -> tl.tensor: + return tl.tensor(builder.create_log(x.handle), x.type) + + +@_check_dtype(dtypes=["fp32", "fp64"]) +def cos(x: tl.tensor, builder: ir.builder) -> tl.tensor: + return tl.tensor(builder.create_cos(x.handle), x.type) + + +@_check_dtype(dtypes=["fp32", "fp64"]) +def sin(x: tl.tensor, builder: ir.builder) -> tl.tensor: + return tl.tensor(builder.create_sin(x.handle), x.type) + + +@_check_dtype(dtypes=["fp32", "fp64"]) +def sqrt(x: tl.tensor, builder: ir.builder) -> tl.tensor: + return tl.tensor(builder.create_sqrt(x.handle), x.type) + + +def abs(x: tl.tensor, builder: ir.builder) -> tl.tensor: + dtype = x.dtype + if dtype.is_floating(): + return tl.tensor(builder.create_fabs(x.handle), x.type) + elif dtype.is_int_signed(): + return tl.tensor(builder.create_iabs(x.handle), x.type) + elif dtype.is_int_unsigned(): + return x # no-op + else: + assert False, f"Unexpected dtype {dtype}" + + +## + + +def multiple_of(x: tl.tensor, values: List[int]) -> tl.tensor: + if max(1, len(x.shape)) != len(values): + raise ValueError("Shape of input to multiple_of does not match the length of values") + x.handle.set_attr("tt.divisibility", ir.make_attr(values, x.handle.get_context())) + return x + + +def max_contiguous(x: tl.tensor, values: List[int]) -> tl.tensor: + if len(x.shape) != len(values): + raise ValueError("Shape of input to max_contiguous does not match the length of values") + x.handle.set_attr("tt.contiguity", ir.make_attr(values, x.handle.get_context())) + return x + + +def max_constancy(x: tl.tensor, values: List[int]) -> tl.tensor: + if len(x.shape) != len(values): + raise ValueError("Shape of input to max_constancy does not match the length of values") + x.handle.set_attr("tt.constancy", ir.make_attr(values, x.handle.get_context())) + return x + + +def debug_barrier(builder: ir.builder) -> tl.tensor: + return tl.tensor(builder.create_barrier(), tl.void) + + +def device_print(prefix: str, args: List[tl.tensor], builder: ir.builder) -> tl.tensor: + # It makes sense visually for prefix to end in ": "; make it so. Also, + # non-empty prefixes should start with " ". + if not prefix.endswith(" ") and args: + prefix += " " + if not prefix.endswith(": ") and args: + prefix = prefix[:-1] + ": " + if len(prefix) > 2 and not prefix.startswith(" "): + prefix = " " + prefix + + new_args = [] + for arg in args: + new_args.append(arg.handle) + return tl.tensor(builder.create_print(prefix, new_args), tl.void) + + +def device_assert(cond: tl.tensor, msg: str, file_name: str, func_name, lineno: int, builder: ir.builder) -> tl.tensor: + cond_ty = cond.type + if not cond_ty.is_block(): + cond_ty = tl.block_type(cond_ty.scalar, (1, )) + cond = tl.tensor(builder.create_splat(cond.handle, (1, )), cond_ty) + return tl.tensor(builder.create_assert(cond.handle, msg, file_name, func_name, lineno), tl.void) + + +def _convert_elem_to_ir_value(builder, elem, require_i64): + if isinstance(elem, int): + elem = tl.constexpr(elem) + if isinstance(elem, tl.constexpr): + return builder.get_int64(elem.value) if require_i64 else builder.get_int32(elem.value) + elif isinstance(elem, tl.tensor): + assert elem.numel.value == 1, "Expected a scalar in shape/strides/offsets" + assert elem.dtype.is_int(), "Expected an integer scalar type in shape/strides/offsets" + if elem.dtype != tl.int64 and require_i64: + return builder.create_int_cast(elem.handle, builder.get_int64_ty(), elem.dtype.is_int_signed()) + elif elem.dtype != tl.int32: + return builder.create_int_cast(elem.handle, builder.get_int32_ty(), elem.dtype.is_int_signed()) + return elem.handle + assert False, f"Unsupported element type in shape/strides/offsets: {type(elem)}" + + +def _convert_to_ir_values(builder, list_like, require_i64=True): + if hasattr(list_like, "__iter__"): + return [_convert_elem_to_ir_value(builder, elem, require_i64) for elem in list_like] + return [_convert_elem_to_ir_value(builder, list_like, require_i64)] + + +def make_block_ptr(base: tl.tensor, shape, strides, offsets, block_shape, order, builder: ir.builder) -> tl.tensor: + # Convert dynamic arguments to IR values + # NOTES(Chenggang): current `shape/strides` are `int64_t`, while `offsets/block_shape` are `int32_t` + shape = _convert_to_ir_values(builder, shape) + strides = _convert_to_ir_values(builder, strides) + offsets = _convert_to_ir_values(builder, offsets, require_i64=False) + + # Check `base` type + if not base.type.is_ptr() or base.type.element_ty.is_block(): + raise ValueError("Expected `base` to be a pointer type (but not a block pointer type or others)") + + # Treat `pointer_type` as `pointer_type` + if base.type.element_ty == tl.int1: + base = cast(base, tl.pointer_type(tl.int8, base.type.address_space), builder) + + # Check whether `block_shape` is static + if not hasattr(block_shape, "__iter__"): + block_shape = [block_shape] + block_shape = [elem.value if isinstance(elem, tl.constexpr) else elem for elem in block_shape] + assert all([isinstance(elem, int) and -2**31 <= elem < 2**31 for elem in block_shape]), \ + "Expected a list of constant integers (`int32_t` range) in `block_shape`" + + # Check `order` + if not hasattr(order, "__iter__"): + order = [order] + order = [elem.value if isinstance(elem, tl.constexpr) else elem for elem in order] + assert sorted(order) == list(range(len(order))), "Expected a permutation of (0, 1, ..., len(order)-1) in order" + + # Must have same length + assert all([len(block_shape) == len(list_like) for list_like in [shape, strides, offsets, order]]), \ + "Expected shape/strides/offsets/block_shape to have the same length" + + # Build value, the type is: + # `pointer_type>` in Python + # `tt.ptr>` in MLIR + handle = builder.create_make_block_ptr(base.handle, shape, strides, offsets, block_shape, order) + return tl.tensor(handle, tl.pointer_type(tl.block_type(base.type.element_ty, block_shape))) + + +def advance(base: tl.tensor, offsets, builder: ir.builder) -> tl.tensor: + # Convert dynamic offsets to IR values + offsets = _convert_to_ir_values(builder, offsets, require_i64=False) + + # Advanced block pointer type is the same as before + return tl.tensor(builder.create_advance(base.handle, offsets), base.type) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/ops/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/ops/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6ceec8b56a00ba5ad66682aa3f8cdd7015a304a4 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/ops/__init__.py @@ -0,0 +1,14 @@ +# from .conv import _conv, conv +from . import blocksparse +from .cross_entropy import _cross_entropy, cross_entropy +from .flash_attention import attention +from .matmul import _matmul, matmul + +__all__ = [ + "blocksparse", + "_cross_entropy", + "cross_entropy", + "_matmul", + "matmul", + "attention", +] diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/ops/__pycache__/cross_entropy.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/ops/__pycache__/cross_entropy.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d5cc5393d509ab9032565c0aa87e5b12ea360d2a Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/ops/__pycache__/cross_entropy.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/ops/__pycache__/matmul.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/ops/__pycache__/matmul.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f42f6d11f0a4aa564d693b284a3bba49f1604f88 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/ops/__pycache__/matmul.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/ops/__pycache__/matmul_perf_model.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/ops/__pycache__/matmul_perf_model.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e7ce6cb4360c8ac39b322d443e9bdf4f338fed39 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/ops/__pycache__/matmul_perf_model.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/ops/blocksparse/__pycache__/matmul.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/ops/blocksparse/__pycache__/matmul.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..348576903b96f69219e9066076d6c9c42103ada1 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/ops/blocksparse/__pycache__/matmul.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/ops/blocksparse/__pycache__/softmax.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/ops/blocksparse/__pycache__/softmax.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..782ab1079ac49a196af11e2085f8285809d07c13 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/ops/blocksparse/__pycache__/softmax.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/ops/blocksparse/softmax.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/ops/blocksparse/softmax.py new file mode 100644 index 0000000000000000000000000000000000000000..bcffff26bb515aa65f9e199d9e7c3b36bc6fe1c3 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/ops/blocksparse/softmax.py @@ -0,0 +1,228 @@ +import torch + +from ... import jit +from ... import language as tl +from ... import next_power_of_2 + + +def num_warps(n): + if n <= 128: + return 1 + if n <= 256: + return 2 + if n <= 512: + return 4 + if n <= 4096: + return 8 + return 16 + + +@jit +def _blocksparse_softmax_fwd(Out, A, stride_xz, LUT, # + R, extent, stride_zr, stride_hr, # relative attention + scale, is_causal, # + ROW_SIZE: tl.constexpr, # + BLOCK_SIZE: tl.constexpr, # + IS_DENSE: tl.constexpr # + ): + h = tl.program_id(0) + m = tl.program_id(1) + z = tl.program_id(2) + # create index ranges + hm = h * tl.num_programs(1) + m + lane_n = tl.arange(0, ROW_SIZE) % BLOCK_SIZE + block_n = tl.arange(0, ROW_SIZE) // BLOCK_SIZE + # extract information from LUT + header = LUT + (hm // BLOCK_SIZE) * 2 + size = tl.load(header + 0) + offset = tl.load(header + 1) + # pointer offset + off_a = z * stride_xz + off_a += (offset + block_n) * BLOCK_SIZE * BLOCK_SIZE # block indx + off_a += (m % BLOCK_SIZE) * BLOCK_SIZE # row indx + # do not need to read column indices in the dense case + if IS_DENSE: + ns = tl.arange(0, ROW_SIZE) + else: + off_lut = offset + 2 * tl.num_programs(0) * tl.num_programs(1) // BLOCK_SIZE + start_n = tl.load(LUT + off_lut + block_n, mask=block_n < size, other=0) + ns = start_n * BLOCK_SIZE + lane_n + # load X + mask = block_n < size + a = tl.load(A + off_a + lane_n, mask=mask, other=-float("inf")) + a = a.to(tl.float32) + # compute + out = a + out *= scale + # apply relative attention + if R is not None: + R += z * stride_zr + R += h * stride_hr + off_lo = (extent - m - 1) + ns + mask_lo = (off_lo >= 0) & (off_lo < extent) + rel_logits = tl.load(R + m * extent + off_lo, mask=mask_lo, other=0.0) + out += rel_logits + out = out.to(tl.float32) + # apply causal mask + out = tl.where((ns > m) & is_causal, -float("inf"), out) + # computation + out = tl.softmax(out) + # write-back + tl.store(Out + off_a + lane_n, out, mask=mask) + + +@jit +def _blocksparse_softmax_bwd(DA, stride_zdx, # + DOut, stride_zdout, # + Out, stride_zout, # + scale, # + LUT, # + DR, extent, stride_zr, stride_hr, stride_er, # + is_causal, # + ROW_SIZE: tl.constexpr, # + BLOCK_SIZE: tl.constexpr, # + IS_DENSE: tl.constexpr): + h = tl.program_id(0) + m = tl.program_id(1) + z = tl.program_id(2) + # create index ranges + hm = h * tl.num_programs(1) + m + lane_n = tl.arange(0, ROW_SIZE) % BLOCK_SIZE + block_n = tl.arange(0, ROW_SIZE) // BLOCK_SIZE + # extract information from LUT + header = LUT + (hm // BLOCK_SIZE) * 2 + size = tl.load(header + 0) + offset = tl.load(header + 1) + # row-col offset + off_mn = (offset + block_n) * BLOCK_SIZE * BLOCK_SIZE + off_mn += (m % BLOCK_SIZE) * BLOCK_SIZE + mask = block_n < size + # pointers + As = Out + z * stride_zout + off_mn + DOuts = DOut + z * stride_zdout + off_mn + # do not need to read column indices in the dense case + if IS_DENSE: + ns = tl.arange(0, ROW_SIZE) + else: + off_lut = offset + 2 * tl.num_programs(0) * tl.num_programs(1) // BLOCK_SIZE + start_n = tl.load(LUT + off_lut + block_n, mask=mask, other=0) + ns = start_n * BLOCK_SIZE + lane_n + # load data + a = tl.load(As + lane_n, mask=mask, other=0.0) + a = a.to(tl.float32) + dout = tl.load(DOuts + lane_n, mask=mask, other=0.0) + dout = dout.to(tl.float32) + # compute + a = tl.where((ns > m) & is_causal & (a == a), 0., a) + da = a * (dout - tl.sum(a * dout, 0)) + # apply relative attention + if DR is not None: + DR += z * stride_zr + DR += h * stride_hr + off_lo = (extent - m - 1) + ns + mask_lo = (off_lo >= 0) & (off_lo < extent) & mask + tl.store(DR + m * extent + off_lo, da, mask=mask_lo) + da = da * scale + # convert da + # write-back + DAs = DA + z * stride_zdx + off_mn + tl.store(DAs + lane_n, da, mask=mask) + + +class _softmax(torch.autograd.Function): + + @staticmethod + def make_lut(layout, block, device): + _empty = torch.tensor([], dtype=torch.int64, device=layout.device) + sizes = _empty.clone() + # sizes along rows + for h in range(layout.shape[0]): + sizes = torch.cat((sizes, layout[h, :, :].sum(-1))) + total_sizes = sizes * block + # offsets in block format + offsets = torch.zeros_like(sizes) + offsets[1:] = torch.cumsum(sizes[:-1], dim=0) + # block indices + columns = layout.nonzero(as_tuple=False)[:, 2] + header = torch.stack((sizes, offsets), dim=1).view(-1) + lut = torch.cat((header, columns)).type(torch.int32).to(device) + return lut, int(total_sizes.max()) + + @staticmethod + def forward(ctx, a, scale, rel_logits, is_causal, spdims, block, lut, maxlut, is_dense): + if scale is not None and isinstance(scale, torch.Tensor): + assert scale.device.type == "cpu" + scale = scale.item() + M = a.shape[0] + grid = [spdims[0], spdims[1] * block, M] + rel_shape = (1, 1, 1, 1) if rel_logits is None else rel_logits.shape + rel_strides = (1, 1, 1, 1) if rel_logits is None else rel_logits.stride() + # enqueue kernel + out = torch.empty_like(a) + _blocksparse_softmax_fwd[grid]( + out, a, a.stride(0), lut, # + rel_logits, rel_shape[-1], rel_strides[0], rel_strides[1], # relative attn# + scale, # + is_causal, # + BLOCK_SIZE=block, # + ROW_SIZE=next_power_of_2(maxlut), # + IS_DENSE=is_dense, # + num_warps=num_warps(maxlut) # + ) + # save to context + # ctx.mark_dirty(x) + ctx.save_for_backward(out, lut) + ctx.spdims = spdims + ctx.block = block + ctx.maxlut = maxlut + ctx.scale = scale + ctx.rel_shape = rel_shape + ctx.rel_strides = rel_strides + ctx.rel_dtype = a.dtype + ctx.is_dense = is_dense + ctx.is_causal = is_causal + return out + + @staticmethod + def backward(ctx, dout): + # retrieve from context + out, lut = ctx.saved_tensors + # relative logits gradients + dr = None + if ctx.needs_input_grad[3]: + dr = torch.zeros(ctx.rel_shape, dtype=ctx.rel_dtype, device=out.device) + # run kernel + M = out.shape[0] + grid = (ctx.spdims[0], ctx.spdims[1] * ctx.block, M) + da = torch.empty_like(dout) + _blocksparse_softmax_bwd[grid]( + da, da.stride(0), # + dout, dout.stride(0), # + out, out.stride(0), # + ctx.scale, # + lut, # + dr, ctx.rel_shape[-1], ctx.rel_strides[0], ctx.rel_strides[1], ctx.rel_strides[2], # + ctx.is_causal, # + BLOCK_SIZE=ctx.block, # + ROW_SIZE=next_power_of_2(ctx.maxlut), # + IS_DENSE=ctx.is_dense, # + num_warps=num_warps(ctx.maxlut) # + ) + return (da, None, None, dr, None, None, None, None, None, None, None, None, None, None, None, None, None, None) + + +class softmax: + + def __init__(self, layout, block, device, is_dense=False): + self.spdims = layout.shape + self.layout = layout + self.block = block + self.lut, self.maxlut = _softmax.make_lut(self.layout, self.block, device) + self.is_dense = is_dense + + def __call__(self, a, *, scale=1.0, rel_logits=None, is_causal=False): + if rel_logits is not None and rel_logits.dtype != a.dtype: + raise ValueError(f"relative position embedding must be {a.dtype}") + a = _softmax.apply(a, scale, rel_logits, is_causal, self.spdims, self.block, self.lut, self.maxlut, + self.is_dense) + return a diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/ops/matmul.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/ops/matmul.py new file mode 100644 index 0000000000000000000000000000000000000000..832e52727f092f5f543730b16406c3c9d7f05d80 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/ops/matmul.py @@ -0,0 +1,204 @@ +import torch + +from .. import Config, autotune, cdiv, heuristics, jit +from .. import language as tl +from .matmul_perf_model import early_config_prune, estimate_matmul_time + +_ordered_datatypes = [torch.float16, torch.bfloat16, torch.float32] + + +def get_higher_dtype(a, b): + if a is b: + return a + + assert a in _ordered_datatypes + assert b in _ordered_datatypes + + for d in _ordered_datatypes: + if a is d: + return b + if b is d: + return a + + +def init_to_zero(name): + return lambda nargs: nargs[name].zero_() + + +def get_configs_io_bound(): + configs = [] + for num_stages in [2, 3, 4, 5, 6]: + for block_m in [16, 32]: + for block_k in [32, 64]: + for block_n in [32, 64, 128, 256]: + num_warps = 2 if block_n <= 64 else 4 + configs.append( + Config({'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': 1}, + num_stages=num_stages, num_warps=num_warps)) + # split_k + for split_k in [2, 4, 8, 16]: + configs.append( + Config({'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': split_k}, + num_stages=num_stages, num_warps=num_warps, pre_hook=init_to_zero('C'))) + return configs + + +@autotune( + configs=[ + # basic configs for compute-bound matmuls + Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8), + Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8), + Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=5, num_warps=2), + # good for int8 + Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8), + Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8), + Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), + Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=5, num_warps=2), + ] + get_configs_io_bound(), + key=['M', 'N', 'K'], + prune_configs_by={ + 'early_config_prune': early_config_prune, + 'perf_model': estimate_matmul_time, + 'top_k': 10, + }, +) +@heuristics({ + 'EVEN_K': lambda args: args['K'] % (args['BLOCK_K'] * args['SPLIT_K']) == 0, +}) +@jit +def _kernel(A, B, C, M, N, K, # + stride_am, stride_ak, # + stride_bk, stride_bn, # + stride_cm, stride_cn, # + dot_out_dtype: tl.constexpr, # + allow_tf32: tl.constexpr, # + fp8_fast_accum: tl.constexpr, # + BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, # + GROUP_M: tl.constexpr, SPLIT_K: tl.constexpr, EVEN_K: tl.constexpr, AB_DTYPE: tl.constexpr # + ): + # matrix multiplication + pid = tl.program_id(0) + pid_z = tl.program_id(1) + grid_m = tl.cdiv(M, BLOCK_M) + grid_n = tl.cdiv(N, BLOCK_N) + # re-order program ID for better L2 performance + width = GROUP_M * grid_n + group_id = pid // width + group_size = min(grid_m - group_id * GROUP_M, GROUP_M) + pid_m = group_id * GROUP_M + (pid % group_size) + pid_n = (pid % width) // (group_size) + # do matrix multiplication + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) + rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + rk = pid_z * BLOCK_K + tl.arange(0, BLOCK_K) + # pointers + A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) + B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn) + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=dot_out_dtype) + for k in range(0, tl.cdiv(K, BLOCK_K * SPLIT_K)): + if EVEN_K: + a = tl.load(A) + b = tl.load(B) + else: + k_remaining = K - k * (BLOCK_K * SPLIT_K) + _0 = tl.zeros((1, 1), dtype=C.dtype.element_ty) + a = tl.load(A, mask=rk[None, :] < k_remaining, other=_0) + b = tl.load(B, mask=rk[:, None] < k_remaining, other=_0) + if AB_DTYPE: + a = a.to(C.dtype.element_ty) + b = b.to(C.dtype.element_ty) + if fp8_fast_accum: + acc = tl.dot(a, b, acc, out_dtype=dot_out_dtype, allow_tf32=allow_tf32) + else: + acc += tl.dot(a, b, out_dtype=dot_out_dtype, allow_tf32=allow_tf32) + A += BLOCK_K * SPLIT_K * stride_ak + B += BLOCK_K * SPLIT_K * stride_bk + acc = acc.to(C.dtype.element_ty) + # rematerialize rm and rn to save registers + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + C = C + (rm[:, None] * stride_cm + rn[None, :] * stride_cn) + mask = (rm < M)[:, None] & (rn < N)[None, :] + # handles write-back with reduction-splitting + if SPLIT_K == 1: + tl.store(C, acc, mask=mask) + else: + tl.atomic_add(C, acc, mask=mask) + + +class _matmul(torch.autograd.Function): + kernel = _kernel + + _locks = {} + + @staticmethod + def _call(a, b, dot_out_dtype, allow_tf32, fp8_fast_accum): + device = a.device + # handle non-contiguous inputs if necessary + if a.stride(0) > 1 and a.stride(1) > 1: + a = a.contiguous() + if b.stride(0) > 1 and b.stride(1) > 1: + b = b.contiguous() + # checks constraints + assert a.shape[1] == b.shape[0], "incompatible dimensions" + M, K = a.shape + _, N = b.shape + # allocates output + if a.dtype in [tl.float8e4nv, tl.float8e4b15, tl.float8e5] or\ + b.dtype in [tl.float8e4nv, tl.float8e4b15, tl.float8e5]: + c_dtype = torch.float16 + elif a.dtype in [torch.int8] or b.dtype in [torch.int8]: + c_dtype = torch.int32 + else: + c_dtype = get_higher_dtype(a.dtype, b.dtype) + c = torch.empty((M, N), device=device, dtype=c_dtype) + if dot_out_dtype is None: + if c_dtype in [torch.float16, torch.float32, torch.bfloat16]: + dot_out_dtype = tl.float32 + else: + dot_out_dtype = tl.int32 + else: + assert isinstance(dot_out_dtype, torch.dtype), "dot_out_dtype must be a torch.dtype" + if dot_out_dtype == torch.float16: + dot_out_dtype = tl.float16 + elif dot_out_dtype in [torch.float32, torch.bfloat16]: + dot_out_dtype = tl.float32 + else: + dot_out_dtype = tl.int32 + ab_dtype = True + if a.dtype in [tl.float8e4nv, tl.float8e5] and b.dtype in [tl.float8e4nv, tl.float8e5]: + ab_dtype = False + if a.dtype in [torch.int8] and b.dtype in [torch.int8]: + ab_dtype = False + # launch kernel + grid = lambda META: (cdiv(M, META['BLOCK_M']) * cdiv(N, META['BLOCK_N']), META['SPLIT_K']) + _kernel[grid]( + a, b, c, M, N, K, # + a.stride(0), a.stride(1), # + b.stride(0), b.stride(1), # + c.stride(0), c.stride(1), # + dot_out_dtype=dot_out_dtype, # + allow_tf32=allow_tf32, # + fp8_fast_accum=fp8_fast_accum, # + GROUP_M=8, AB_DTYPE=ab_dtype) + return c + + @staticmethod + def forward(ctx, a, b, dot_out_dtype=None, allow_tf32=True, fp8_fast_accum=True): + return _matmul._call(a, b, dot_out_dtype=dot_out_dtype, allow_tf32=allow_tf32, fp8_fast_accum=fp8_fast_accum) + + +matmul = _matmul.apply diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/runtime/autotuner.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/runtime/autotuner.py new file mode 100644 index 0000000000000000000000000000000000000000..f46c5d30c2dc3622a0bf6d5a550dea4177f75a57 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/runtime/autotuner.py @@ -0,0 +1,328 @@ +from __future__ import annotations + +import builtins +import time +from typing import Dict + +from ..testing import do_bench +from .jit import KernelInterface + + +class OutOfResources(Exception): + + def __init__(self, required, limit, name): + self.message = (f"out of resource: {name}, Required: {required}, Hardware limit: {limit}. " + + "Reducing block sizes or `num_stages` may help.") + self.required = required + self.limit = limit + self.name = name + super().__init__(self.message) + + def __reduce__(self): + # this is necessary to make CompilationError picklable + return (type(self), (self.required, self.limit, self.name)) + + +class Autotuner(KernelInterface): + + def __init__( + self, + fn, + arg_names, + configs, + key, + reset_to_zero, + restore_value, + prune_configs_by: Dict = None, + warmup=25, + rep=100, + ): + """ + :param prune_configs_by: a dict of functions that are used to prune configs, fields: + 'perf_model': performance model used to predicate running time with different configs, returns running time + 'top_k': number of configs to bench + 'prune_num_stages_by'(optional): a function used to prune num_stages. It takes configs:List[Config] as its input, and returns pruned configs. + """ + if not configs: + self.configs = [Config({}, num_warps=4, num_stages=2, num_ctas=1)] + else: + self.configs = configs + self.key_idx = [arg_names.index(k) for k in key] + self.cache = {} + self.arg_names = arg_names + + # Reset to zero or restore values + self.reset_idx = [] + if reset_to_zero is not None: + self.reset_idx = [arg_names.index(k) for k in reset_to_zero] + self.restore_idx = [] + if restore_value is not None: + self.restore_idx = [arg_names.index(k) for k in restore_value] + + # Hook to reset or restore for required tensors + self.pre_hook = lambda args, reset_only=False: 0 + self.post_hook = lambda args: 0 + if len(self.reset_idx) > 0 or len(self.restore_idx) > 0: + + def _pre_hook(args, reset_only=False): + for i in self.reset_idx: + args[i].zero_() + if not reset_only: + self.restore_copies = [args[i].clone() for i in self.restore_idx] + + self.pre_hook = _pre_hook + if len(self.restore_idx) > 0: + + def _post_hook(args): + for i, j in enumerate(self.restore_idx): + args[j].copy_(self.restore_copies[i]) + self.restore_copies = [] + + self.post_hook = _post_hook + + self.perf_model = None + self.configs_top_k = 1.0 + self.early_config_prune = None + if prune_configs_by: + self.perf_model = prune_configs_by.get("perf_model", self.perf_model) + self.configs_top_k = prune_configs_by.get("top_k", self.configs_top_k) + self.early_config_prune = prune_configs_by.get("early_config_prune", self.early_config_prune) + + self.fn = fn + self.warmup = warmup + self.rep = rep + + def _bench(self, *args, config, **meta): + # check for conflicts, i.e. meta-parameters both provided + # as kwargs and by the autotuner + conflicts = meta.keys() & config.kwargs.keys() + if conflicts: + raise ValueError(f"Conflicting meta-parameters: {', '.join(conflicts)}." + " Make sure that you don't re-define auto-tuned symbols.") + # augment meta-parameters with tunable ones + current = dict(meta, **config.kwargs) + full_nargs = {**self.nargs, **current} + + def kernel_call(): + if config.pre_hook: + config.pre_hook(full_nargs) + self.pre_hook(args) + self.fn.run( + *args, + num_warps=config.num_warps, + num_stages=config.num_stages, + num_ctas=config.num_ctas, + enable_warp_specialization=config.enable_warp_specialization, + # enable_persistent=False, + **current, + ) + self.post_hook(args) + + try: + return do_bench(kernel_call, warmup=self.warmup, rep=self.rep, quantiles=(0.5, 0.2, 0.8)) + except OutOfResources: + return [float("inf"), float("inf"), float("inf")] + + def run(self, *args, **kwargs): + self.nargs = dict(zip(self.arg_names, args)) + if len(self.configs) > 1: + all_args = {**self.nargs, **kwargs} + _args = [] + for name in self.arg_names: + if name in all_args: + _args.append(all_args[name]) + key = [_args[i] for i in self.key_idx] + for arg in _args: + if hasattr(arg, "dtype"): + key.append(str(arg.dtype)) + key = tuple(key) + if key not in self.cache: + # prune configs + pruned_configs = self.prune_configs(kwargs) + bench_start = time.time() + timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs} + bench_end = time.time() + self.bench_time = bench_end - bench_start + self.cache[key] = builtins.min(timings, key=timings.get) + self.pre_hook(args, reset_only=True) + self.configs_timings = timings + config = self.cache[key] + else: + config = self.configs[0] + self.best_config = config + full_nargs = {**self.nargs, **kwargs, **self.best_config.kwargs} + if config.pre_hook is not None: + config.pre_hook(full_nargs) + ret = self.fn.run( + *args, + num_warps=config.num_warps, + num_stages=config.num_stages, + num_ctas=config.num_ctas, + enable_warp_specialization=config.enable_warp_specialization, + **kwargs, + **config.kwargs, + ) + self.nargs = None + return ret + + def prune_configs(self, kwargs): + pruned_configs = self.configs + if self.early_config_prune: + pruned_configs = self.early_config_prune(self.configs, self.nargs) + if self.perf_model: + top_k = self.configs_top_k + if isinstance(top_k, float) and top_k <= 1.0: + top_k = int(len(self.configs) * top_k) + if len(pruned_configs) > top_k: + est_timing = { + config: + self.perf_model( + **self.nargs, + **kwargs, + **config.kwargs, + num_stages=config.num_stages, + num_warps=config.num_warps, + num_ctas=config.num_ctas, + enable_warp_specialization=config.enable_warp_specialization, + enable_persistent=config.enable_persistent, + ) + for config in pruned_configs + } + pruned_configs = sorted(est_timing.keys(), key=lambda x: est_timing[x])[:top_k] + return pruned_configs + + def warmup(self, *args, **kwargs): + self.nargs = dict(zip(self.arg_names, args)) + for config in self.prune_configs(kwargs): + self.fn.warmup( + *args, + num_warps=config.num_warps, + num_ctas=config.num_ctas, + num_stages=config.num_stages, + enable_warp_specialization=config.enable_warp_specialization, + enable_persistent=config.enable_persistent, + **kwargs, + **config.kwargs, + ) + self.nargs = None + + +class Config: + """ + An object that represents a possible kernel configuration for the auto-tuner to try. + + :ivar meta: a dictionary of meta-parameters to pass to the kernel as keyword arguments. + :type meta: dict[Str, Any] + :ivar num_warps: the number of warps to use for the kernel when compiled for GPUs. For example, if + `num_warps=8`, then each kernel instance will be automatically parallelized to + cooperatively execute using `8 * 32 = 256` threads. + :type num_warps: int + :ivar num_stages: the number of stages that the compiler should use when software-pipelining loops. + Mostly useful for matrix multiplication workloads on SM80+ GPUs. + :type enable_warp_specialization: bool + :ivar enable_warp_specialization: enable specialization (spatial partitioning) or not. See https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#spatial-partitioning-also-known-as-warp-specialization + :ivar pre_hook: a function that will be called before the kernel is called. Parameters of this + function are args. + """ + + def __init__(self, kwargs, num_warps=4, num_stages=2, num_ctas=1, enable_warp_specialization=False, pre_hook=None): + self.kwargs = kwargs + self.num_warps = num_warps + self.num_ctas = num_ctas + self.num_stages = num_stages + self.enable_warp_specialization = enable_warp_specialization + # TODO[shuhaoj]: May make enable_persistent configurable in future if necessary. + self.enable_persistent = False + self.pre_hook = pre_hook + + def __str__(self): + res = [] + for k, v in self.kwargs.items(): + res.append(f"{k}: {v}") + res.append(f"num_warps: {self.num_warps}") + res.append(f"num_ctas: {self.num_ctas}") + res.append(f"num_stages: {self.num_stages}") + res.append(f"enable_warp_specialization: {self.enable_warp_specialization}") + res.append(f"enable_persistent: {self.enable_persistent}") + return ", ".join(res) + + +def autotune(configs, key, prune_configs_by=None, reset_to_zero=None, restore_value=None, warmup=25, rep=100): + """ + Decorator for auto-tuning a :code:`triton.jit`'d function. + + .. highlight:: python + .. code-block:: python + + @triton.autotune(configs=[ + triton.Config(meta={'BLOCK_SIZE': 128}, num_warps=4), + triton.Config(meta={'BLOCK_SIZE': 1024}, num_warps=8), + ], + key=['x_size'] # the two above configs will be evaluated anytime + # the value of x_size changes + ) + @triton.jit + def kernel(x_ptr, x_size, **META): + BLOCK_SIZE = META['BLOCK_SIZE'] + :note: When all the configurations are evaluated, the kernel will run multiple times. + This means that whatever value the kernel updates will be updated multiple times. + To avoid this undesired behavior, you can use the `reset_to_zero` argument, which + resets the value of the provided tensor to `zero` before running any configuration. + :param configs: a list of :code:`triton.Config` objects + :type configs: list[triton.Config] + :param key: a list of argument names whose change in value will trigger the evaluation of all provided configs. + :type key: list[str] + :param prune_configs_by: a dict of functions that are used to prune configs, fields: + 'perf_model': performance model used to predicate running time with different configs, returns running time + 'top_k': number of configs to bench + 'early_config_prune'(optional): a function used to do early prune (eg, num_stages). It takes configs:List[Config] as its input, and returns pruned configs. + :param reset_to_zero: a list of argument names whose value will be reset to zero before evaluating any configs. + :type reset_to_zero: list[str] + :param restore_value: a list of argument names whose value will be restored after evaluating any configs. + :type restore_value: list[str] + :param warmup: Warmup time (in ms) to pass to benchmarking, defaults to 25. + :type warmup: int + :param rep: Repetition time (in ms) to pass to benchmarking, defaults to 100. + :type rep: int + """ + + def decorator(fn): + return Autotuner(fn, fn.arg_names, configs, key, reset_to_zero, restore_value, prune_configs_by, warmup, rep) + + return decorator + + +class Heuristics(KernelInterface): + + def __init__(self, fn, arg_names, values) -> None: + self.fn = fn + self.values = values + self.arg_names = arg_names + + def run(self, *args, **kwargs): + for v, heur in self.values.items(): + kwargs[v] = heur({**dict(zip(self.arg_names, args)), **kwargs}) + return self.fn.run(*args, **kwargs) + + +def heuristics(values): + """ + Decorator for specifying how the values of certain meta-parameters may be computed. + This is useful for cases where auto-tuning is prohibitevely expensive, or just not applicable. + + .. highlight:: python + .. code-block:: python + + @triton.heuristics(values={'BLOCK_SIZE': lambda args: 2 ** int(math.ceil(math.log2(args[1])))}) + @triton.jit + def kernel(x_ptr, x_size, **META): + BLOCK_SIZE = META['BLOCK_SIZE'] # smallest power-of-two >= x_size + :param values: a dictionary of meta-parameter names and functions that compute the value of the meta-parameter. + each such function takes a list of positional arguments as input. + :type values: dict[str, Callable[[list[Any]], Any]] + """ + + def decorator(fn): + return Heuristics(fn, fn.arg_names, values) + + return decorator diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/runtime/errors.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/runtime/errors.py new file mode 100644 index 0000000000000000000000000000000000000000..a5d69aba685ec55c85853c7044724ae991ec5131 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/runtime/errors.py @@ -0,0 +1,13 @@ +class OutOfResources(Exception): + + def __init__(self, required, limit, name): + self.message = f"out of resource: {name}, " f"Required: {required}, " f"Hardware limit: {limit}" + self.message += ". Reducing block sizes or `num_stages` may help." + self.required = required + self.limit = limit + self.name = name + super().__init__(self.message) + + def __reduce__(self): + # this is necessary to make CompilationError picklable + return (type(self), (self.required, self.limit, self.name)) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/third_party/cuda/include/cuda.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/third_party/cuda/include/cuda.h new file mode 100644 index 0000000000000000000000000000000000000000..ec111be805af301204fb9041f76e9da5b5cf211c --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/third_party/cuda/include/cuda.h @@ -0,0 +1,22119 @@ +/* + * Copyright 1993-2022 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef __cuda_cuda_h__ +#define __cuda_cuda_h__ + + + +#include +#ifdef _MSC_VER +typedef unsigned __int32 cuuint32_t; +typedef unsigned __int64 cuuint64_t; +#else +#include +typedef uint32_t cuuint32_t; +typedef uint64_t cuuint64_t; +#endif + +#if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED) +#define __CUDA_DEPRECATED +#elif defined(_MSC_VER) +#define __CUDA_DEPRECATED __declspec(deprecated) +#elif defined(__GNUC__) +#define __CUDA_DEPRECATED __attribute__((deprecated)) +#else +#define __CUDA_DEPRECATED +#endif + +#if defined(CUDA_FORCE_API_VERSION) +#error "CUDA_FORCE_API_VERSION is no longer supported." +#endif + +#if defined(__CUDA_API_VERSION_INTERNAL) || defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) + #define __CUDA_API_PER_THREAD_DEFAULT_STREAM + #define __CUDA_API_PTDS(api) api ## _ptds + #define __CUDA_API_PTSZ(api) api ## _ptsz +#else + #define __CUDA_API_PTDS(api) api + #define __CUDA_API_PTSZ(api) api +#endif + +#define cuDeviceTotalMem cuDeviceTotalMem_v2 +#define cuCtxCreate cuCtxCreate_v2 +#define cuCtxCreate_v3 cuCtxCreate_v3 +#define cuModuleGetGlobal cuModuleGetGlobal_v2 +#define cuMemGetInfo cuMemGetInfo_v2 +#define cuMemAlloc cuMemAlloc_v2 +#define cuMemAllocPitch cuMemAllocPitch_v2 +#define cuMemFree cuMemFree_v2 +#define cuMemGetAddressRange cuMemGetAddressRange_v2 +#define cuMemAllocHost cuMemAllocHost_v2 +#define cuMemHostGetDevicePointer cuMemHostGetDevicePointer_v2 +#define cuMemcpyHtoD __CUDA_API_PTDS(cuMemcpyHtoD_v2) +#define cuMemcpyDtoH __CUDA_API_PTDS(cuMemcpyDtoH_v2) +#define cuMemcpyDtoD __CUDA_API_PTDS(cuMemcpyDtoD_v2) +#define cuMemcpyDtoA __CUDA_API_PTDS(cuMemcpyDtoA_v2) +#define cuMemcpyAtoD __CUDA_API_PTDS(cuMemcpyAtoD_v2) +#define cuMemcpyHtoA __CUDA_API_PTDS(cuMemcpyHtoA_v2) +#define cuMemcpyAtoH __CUDA_API_PTDS(cuMemcpyAtoH_v2) +#define cuMemcpyAtoA __CUDA_API_PTDS(cuMemcpyAtoA_v2) +#define cuMemcpyHtoAAsync __CUDA_API_PTSZ(cuMemcpyHtoAAsync_v2) +#define cuMemcpyAtoHAsync __CUDA_API_PTSZ(cuMemcpyAtoHAsync_v2) +#define cuMemcpy2D __CUDA_API_PTDS(cuMemcpy2D_v2) +#define cuMemcpy2DUnaligned __CUDA_API_PTDS(cuMemcpy2DUnaligned_v2) +#define cuMemcpy3D __CUDA_API_PTDS(cuMemcpy3D_v2) +#define cuMemcpyHtoDAsync __CUDA_API_PTSZ(cuMemcpyHtoDAsync_v2) +#define cuMemcpyDtoHAsync __CUDA_API_PTSZ(cuMemcpyDtoHAsync_v2) +#define cuMemcpyDtoDAsync __CUDA_API_PTSZ(cuMemcpyDtoDAsync_v2) +#define cuMemcpy2DAsync __CUDA_API_PTSZ(cuMemcpy2DAsync_v2) +#define cuMemcpy3DAsync __CUDA_API_PTSZ(cuMemcpy3DAsync_v2) +#define cuMemsetD8 __CUDA_API_PTDS(cuMemsetD8_v2) +#define cuMemsetD16 __CUDA_API_PTDS(cuMemsetD16_v2) +#define cuMemsetD32 __CUDA_API_PTDS(cuMemsetD32_v2) +#define cuMemsetD2D8 __CUDA_API_PTDS(cuMemsetD2D8_v2) +#define cuMemsetD2D16 __CUDA_API_PTDS(cuMemsetD2D16_v2) +#define cuMemsetD2D32 __CUDA_API_PTDS(cuMemsetD2D32_v2) +#define cuArrayCreate cuArrayCreate_v2 +#define cuArrayGetDescriptor cuArrayGetDescriptor_v2 +#define cuArray3DCreate cuArray3DCreate_v2 +#define cuArray3DGetDescriptor cuArray3DGetDescriptor_v2 +#define cuTexRefSetAddress cuTexRefSetAddress_v2 +#define cuTexRefGetAddress cuTexRefGetAddress_v2 +#define cuGraphicsResourceGetMappedPointer cuGraphicsResourceGetMappedPointer_v2 +#define cuCtxDestroy cuCtxDestroy_v2 +#define cuCtxPopCurrent cuCtxPopCurrent_v2 +#define cuCtxPushCurrent cuCtxPushCurrent_v2 +#define cuStreamDestroy cuStreamDestroy_v2 +#define cuEventDestroy cuEventDestroy_v2 +#define cuTexRefSetAddress2D cuTexRefSetAddress2D_v3 +#define cuLinkCreate cuLinkCreate_v2 +#define cuLinkAddData cuLinkAddData_v2 +#define cuLinkAddFile cuLinkAddFile_v2 +#define cuMemHostRegister cuMemHostRegister_v2 +#define cuGraphicsResourceSetMapFlags cuGraphicsResourceSetMapFlags_v2 +#define cuStreamBeginCapture __CUDA_API_PTSZ(cuStreamBeginCapture_v2) +#define cuDevicePrimaryCtxRelease cuDevicePrimaryCtxRelease_v2 +#define cuDevicePrimaryCtxReset cuDevicePrimaryCtxReset_v2 +#define cuDevicePrimaryCtxSetFlags cuDevicePrimaryCtxSetFlags_v2 +#define cuDeviceGetUuid_v2 cuDeviceGetUuid_v2 +#define cuIpcOpenMemHandle cuIpcOpenMemHandle_v2 + +#define cuGraphInstantiate cuGraphInstantiateWithFlags + +#define cuGraphExecUpdate cuGraphExecUpdate_v2 +#define cuGetProcAddress cuGetProcAddress_v2 +#define cuGraphAddKernelNode cuGraphAddKernelNode_v2 +#define cuGraphKernelNodeGetParams cuGraphKernelNodeGetParams_v2 +#define cuGraphKernelNodeSetParams cuGraphKernelNodeSetParams_v2 +#define cuGraphExecKernelNodeSetParams cuGraphExecKernelNodeSetParams_v2 + +#define cuStreamWriteValue32 __CUDA_API_PTSZ(cuStreamWriteValue32_v2) +#define cuStreamWaitValue32 __CUDA_API_PTSZ(cuStreamWaitValue32_v2) +#define cuStreamWriteValue64 __CUDA_API_PTSZ(cuStreamWriteValue64_v2) +#define cuStreamWaitValue64 __CUDA_API_PTSZ(cuStreamWaitValue64_v2) +#define cuStreamBatchMemOp __CUDA_API_PTSZ(cuStreamBatchMemOp_v2) +#define cuStreamGetCaptureInfo __CUDA_API_PTSZ(cuStreamGetCaptureInfo_v2) +#define cuStreamGetCaptureInfo_v2 __CUDA_API_PTSZ(cuStreamGetCaptureInfo_v2) + +#if defined(__CUDA_API_PER_THREAD_DEFAULT_STREAM) + #define cuMemcpy __CUDA_API_PTDS(cuMemcpy) + #define cuMemcpyAsync __CUDA_API_PTSZ(cuMemcpyAsync) + #define cuMemcpyPeer __CUDA_API_PTDS(cuMemcpyPeer) + #define cuMemcpyPeerAsync __CUDA_API_PTSZ(cuMemcpyPeerAsync) + #define cuMemcpy3DPeer __CUDA_API_PTDS(cuMemcpy3DPeer) + #define cuMemcpy3DPeerAsync __CUDA_API_PTSZ(cuMemcpy3DPeerAsync) + #define cuMemPrefetchAsync __CUDA_API_PTSZ(cuMemPrefetchAsync) + + #define cuMemsetD8Async __CUDA_API_PTSZ(cuMemsetD8Async) + #define cuMemsetD16Async __CUDA_API_PTSZ(cuMemsetD16Async) + #define cuMemsetD32Async __CUDA_API_PTSZ(cuMemsetD32Async) + #define cuMemsetD2D8Async __CUDA_API_PTSZ(cuMemsetD2D8Async) + #define cuMemsetD2D16Async __CUDA_API_PTSZ(cuMemsetD2D16Async) + #define cuMemsetD2D32Async __CUDA_API_PTSZ(cuMemsetD2D32Async) + + #define cuStreamGetPriority __CUDA_API_PTSZ(cuStreamGetPriority) + #define cuStreamGetId __CUDA_API_PTSZ(cuStreamGetId) + #define cuStreamGetFlags __CUDA_API_PTSZ(cuStreamGetFlags) + #define cuStreamGetCtx __CUDA_API_PTSZ(cuStreamGetCtx) + #define cuStreamWaitEvent __CUDA_API_PTSZ(cuStreamWaitEvent) + #define cuStreamEndCapture __CUDA_API_PTSZ(cuStreamEndCapture) + #define cuStreamIsCapturing __CUDA_API_PTSZ(cuStreamIsCapturing) + #define cuStreamUpdateCaptureDependencies __CUDA_API_PTSZ(cuStreamUpdateCaptureDependencies) + #define cuStreamAddCallback __CUDA_API_PTSZ(cuStreamAddCallback) + #define cuStreamAttachMemAsync __CUDA_API_PTSZ(cuStreamAttachMemAsync) + #define cuStreamQuery __CUDA_API_PTSZ(cuStreamQuery) + #define cuStreamSynchronize __CUDA_API_PTSZ(cuStreamSynchronize) + #define cuEventRecord __CUDA_API_PTSZ(cuEventRecord) + #define cuEventRecordWithFlags __CUDA_API_PTSZ(cuEventRecordWithFlags) + #define cuLaunchKernel __CUDA_API_PTSZ(cuLaunchKernel) + #define cuLaunchKernelEx __CUDA_API_PTSZ(cuLaunchKernelEx) + #define cuLaunchHostFunc __CUDA_API_PTSZ(cuLaunchHostFunc) + #define cuGraphicsMapResources __CUDA_API_PTSZ(cuGraphicsMapResources) + #define cuGraphicsUnmapResources __CUDA_API_PTSZ(cuGraphicsUnmapResources) + + #define cuLaunchCooperativeKernel __CUDA_API_PTSZ(cuLaunchCooperativeKernel) + + #define cuSignalExternalSemaphoresAsync __CUDA_API_PTSZ(cuSignalExternalSemaphoresAsync) + #define cuWaitExternalSemaphoresAsync __CUDA_API_PTSZ(cuWaitExternalSemaphoresAsync) + + #define cuGraphInstantiateWithParams __CUDA_API_PTSZ(cuGraphInstantiateWithParams) + #define cuGraphUpload __CUDA_API_PTSZ(cuGraphUpload) + #define cuGraphLaunch __CUDA_API_PTSZ(cuGraphLaunch) + #define cuStreamCopyAttributes __CUDA_API_PTSZ(cuStreamCopyAttributes) + #define cuStreamGetAttribute __CUDA_API_PTSZ(cuStreamGetAttribute) + #define cuStreamSetAttribute __CUDA_API_PTSZ(cuStreamSetAttribute) + #define cuMemMapArrayAsync __CUDA_API_PTSZ(cuMemMapArrayAsync) + + #define cuMemFreeAsync __CUDA_API_PTSZ(cuMemFreeAsync) + #define cuMemAllocAsync __CUDA_API_PTSZ(cuMemAllocAsync) + #define cuMemAllocFromPoolAsync __CUDA_API_PTSZ(cuMemAllocFromPoolAsync) +#endif + +/** + * \file cuda.h + * \brief Header file for the CUDA Toolkit application programming interface. + * + * \file cudaGL.h + * \brief Header file for the OpenGL interoperability functions of the + * low-level CUDA driver application programming interface. + * + * \file cudaD3D9.h + * \brief Header file for the Direct3D 9 interoperability functions of the + * low-level CUDA driver application programming interface. + */ + +/** + * \defgroup CUDA_TYPES Data types used by CUDA driver + * @{ + */ + +/** + * CUDA API version number + */ +#define CUDA_VERSION 12010 + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * CUDA device pointer + * CUdeviceptr is defined as an unsigned integer type whose size matches the size of a pointer on the target platform. + */ +#if defined(_WIN64) || defined(__LP64__) +typedef unsigned long long CUdeviceptr_v2; +#else +typedef unsigned int CUdeviceptr_v2; +#endif +typedef CUdeviceptr_v2 CUdeviceptr; /**< CUDA device pointer */ + +typedef int CUdevice_v1; /**< CUDA device */ +typedef CUdevice_v1 CUdevice; /**< CUDA device */ +typedef struct CUctx_st *CUcontext; /**< CUDA context */ +typedef struct CUmod_st *CUmodule; /**< CUDA module */ +typedef struct CUfunc_st *CUfunction; /**< CUDA function */ +typedef struct CUlib_st *CUlibrary; /**< CUDA library */ +typedef struct CUkern_st *CUkernel; /**< CUDA kernel */ +typedef struct CUarray_st *CUarray; /**< CUDA array */ +typedef struct CUmipmappedArray_st *CUmipmappedArray; /**< CUDA mipmapped array */ +typedef struct CUtexref_st *CUtexref; /**< CUDA texture reference */ +typedef struct CUsurfref_st *CUsurfref; /**< CUDA surface reference */ +typedef struct CUevent_st *CUevent; /**< CUDA event */ +typedef struct CUstream_st *CUstream; /**< CUDA stream */ +typedef struct CUgraphicsResource_st *CUgraphicsResource; /**< CUDA graphics interop resource */ +typedef unsigned long long CUtexObject_v1; /**< An opaque value that represents a CUDA texture object */ +typedef CUtexObject_v1 CUtexObject; /**< An opaque value that represents a CUDA texture object */ +typedef unsigned long long CUsurfObject_v1; /**< An opaque value that represents a CUDA surface object */ +typedef CUsurfObject_v1 CUsurfObject; /**< An opaque value that represents a CUDA surface object */ +typedef struct CUextMemory_st *CUexternalMemory; /**< CUDA external memory */ +typedef struct CUextSemaphore_st *CUexternalSemaphore; /**< CUDA external semaphore */ +typedef struct CUgraph_st *CUgraph; /**< CUDA graph */ +typedef struct CUgraphNode_st *CUgraphNode; /**< CUDA graph node */ +typedef struct CUgraphExec_st *CUgraphExec; /**< CUDA executable graph */ +typedef struct CUmemPoolHandle_st *CUmemoryPool; /**< CUDA memory pool */ +typedef struct CUuserObject_st *CUuserObject; /**< CUDA user object for graphs */ + +#ifndef CU_UUID_HAS_BEEN_DEFINED +#define CU_UUID_HAS_BEEN_DEFINED +typedef struct CUuuid_st { /**< CUDA definition of UUID */ + char bytes[16]; +} CUuuid; +#endif + +/** + * CUDA IPC handle size + */ +#define CU_IPC_HANDLE_SIZE 64 + +/** + * CUDA IPC event handle + */ +typedef struct CUipcEventHandle_st { + char reserved[CU_IPC_HANDLE_SIZE]; +} CUipcEventHandle_v1; +typedef CUipcEventHandle_v1 CUipcEventHandle; + +/** + * CUDA IPC mem handle + */ +typedef struct CUipcMemHandle_st { + char reserved[CU_IPC_HANDLE_SIZE]; +} CUipcMemHandle_v1; +typedef CUipcMemHandle_v1 CUipcMemHandle; + +/** + * CUDA Ipc Mem Flags + */ +typedef enum CUipcMem_flags_enum { + CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS = 0x1 /**< Automatically enable peer access between remote devices as needed */ +} CUipcMem_flags; + + +/** + * CUDA Mem Attach Flags + */ +typedef enum CUmemAttach_flags_enum { + CU_MEM_ATTACH_GLOBAL = 0x1, /**< Memory can be accessed by any stream on any device */ + CU_MEM_ATTACH_HOST = 0x2, /**< Memory cannot be accessed by any stream on any device */ + CU_MEM_ATTACH_SINGLE = 0x4 /**< Memory can only be accessed by a single stream on the associated device */ +} CUmemAttach_flags; + +/** + * Context creation flags + */ +typedef enum CUctx_flags_enum { + CU_CTX_SCHED_AUTO = 0x00, /**< Automatic scheduling */ + CU_CTX_SCHED_SPIN = 0x01, /**< Set spin as default scheduling */ + CU_CTX_SCHED_YIELD = 0x02, /**< Set yield as default scheduling */ + CU_CTX_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */ + CU_CTX_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling + * \deprecated This flag was deprecated as of CUDA 4.0 + * and was replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. */ + CU_CTX_SCHED_MASK = 0x07, + CU_CTX_MAP_HOST = 0x08, /**< \deprecated This flag was deprecated as of CUDA 11.0 + * and it no longer has any effect. All contexts + * as of CUDA 3.2 behave as though the flag is enabled. */ + CU_CTX_LMEM_RESIZE_TO_MAX = 0x10, /**< Keep local memory allocation after launch */ + CU_CTX_COREDUMP_ENABLE = 0x20, /**< Trigger coredumps from exceptions in this context */ + CU_CTX_USER_COREDUMP_ENABLE= 0x40, /**< Enable user pipe to trigger coredumps in this context */ + CU_CTX_SYNC_MEMOPS = 0x80, /**< Force synchronous blocking on cudaMemcpy/cudaMemset */ + CU_CTX_FLAGS_MASK = 0xFF +} CUctx_flags; + +/** + * Event sched flags + */ +typedef enum CUevent_sched_flags_enum { + CU_EVENT_SCHED_AUTO = 0x00, /**< Automatic scheduling */ + CU_EVENT_SCHED_SPIN = 0x01, /**< Set spin as default scheduling */ + CU_EVENT_SCHED_YIELD = 0x02, /**< Set yield as default scheduling */ + CU_EVENT_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */ +} CUevent_sched_flags; + +/** + * NVCL event scheduling flags + */ +typedef enum cl_event_flags_enum { + NVCL_EVENT_SCHED_AUTO = 0x00, /**< Automatic scheduling */ + NVCL_EVENT_SCHED_SPIN = 0x01, /**< Set spin as default scheduling */ + NVCL_EVENT_SCHED_YIELD = 0x02, /**< Set yield as default scheduling */ + NVCL_EVENT_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */ +} cl_event_flags; + +/** + * NVCL context scheduling flags + */ +typedef enum cl_context_flags_enum { + NVCL_CTX_SCHED_AUTO = 0x00, /**< Automatic scheduling */ + NVCL_CTX_SCHED_SPIN = 0x01, /**< Set spin as default scheduling */ + NVCL_CTX_SCHED_YIELD = 0x02, /**< Set yield as default scheduling */ + NVCL_CTX_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */ +} cl_context_flags; + + +/** + * Stream creation flags + */ +typedef enum CUstream_flags_enum { + CU_STREAM_DEFAULT = 0x0, /**< Default stream flag */ + CU_STREAM_NON_BLOCKING = 0x1 /**< Stream does not synchronize with stream 0 (the NULL stream) */ +} CUstream_flags; + +/** + * Legacy stream handle + * + * Stream handle that can be passed as a CUstream to use an implicit stream + * with legacy synchronization behavior. + * + * See details of the \link_sync_behavior + */ +#define CU_STREAM_LEGACY ((CUstream)0x1) + +/** + * Per-thread stream handle + * + * Stream handle that can be passed as a CUstream to use an implicit stream + * with per-thread synchronization behavior. + * + * See details of the \link_sync_behavior + */ +#define CU_STREAM_PER_THREAD ((CUstream)0x2) + +/** + * Event creation flags + */ +typedef enum CUevent_flags_enum { + CU_EVENT_DEFAULT = 0x0, /**< Default event flag */ + CU_EVENT_BLOCKING_SYNC = 0x1, /**< Event uses blocking synchronization */ + CU_EVENT_DISABLE_TIMING = 0x2, /**< Event will not record timing data */ + CU_EVENT_INTERPROCESS = 0x4 /**< Event is suitable for interprocess use. CU_EVENT_DISABLE_TIMING must be set */ +} CUevent_flags; + +/** + * Event record flags + */ +typedef enum CUevent_record_flags_enum { + CU_EVENT_RECORD_DEFAULT = 0x0, /**< Default event record flag */ + CU_EVENT_RECORD_EXTERNAL = 0x1 /**< When using stream capture, create an event record node + * instead of the default behavior. This flag is invalid + * when used outside of capture. */ +} CUevent_record_flags; + +/** + * Event wait flags + */ +typedef enum CUevent_wait_flags_enum { + CU_EVENT_WAIT_DEFAULT = 0x0, /**< Default event wait flag */ + CU_EVENT_WAIT_EXTERNAL = 0x1 /**< When using stream capture, create an event wait node + * instead of the default behavior. This flag is invalid + * when used outside of capture.*/ +} CUevent_wait_flags; + +/** + * Flags for ::cuStreamWaitValue32 and ::cuStreamWaitValue64 + */ +typedef enum CUstreamWaitValue_flags_enum { + CU_STREAM_WAIT_VALUE_GEQ = 0x0, /**< Wait until (int32_t)(*addr - value) >= 0 (or int64_t for 64 bit + values). Note this is a cyclic comparison which ignores wraparound. + (Default behavior.) */ + CU_STREAM_WAIT_VALUE_EQ = 0x1, /**< Wait until *addr == value. */ + CU_STREAM_WAIT_VALUE_AND = 0x2, /**< Wait until (*addr & value) != 0. */ + CU_STREAM_WAIT_VALUE_NOR = 0x3, /**< Wait until ~(*addr | value) != 0. Support for this operation can be + queried with ::cuDeviceGetAttribute() and + ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR.*/ + CU_STREAM_WAIT_VALUE_FLUSH = 1<<30 /**< Follow the wait operation with a flush of outstanding remote writes. This + means that, if a remote write operation is guaranteed to have reached the + device before the wait can be satisfied, that write is guaranteed to be + visible to downstream device work. The device is permitted to reorder + remote writes internally. For example, this flag would be required if + two remote writes arrive in a defined order, the wait is satisfied by the + second write, and downstream work needs to observe the first write. + Support for this operation is restricted to selected platforms and can be + queried with ::CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES.*/ +} CUstreamWaitValue_flags; + +/** + * Flags for ::cuStreamWriteValue32 + */ +typedef enum CUstreamWriteValue_flags_enum { + CU_STREAM_WRITE_VALUE_DEFAULT = 0x0, /**< Default behavior */ + CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER = 0x1 /**< Permits the write to be reordered with writes which were issued + before it, as a performance optimization. Normally, + ::cuStreamWriteValue32 will provide a memory fence before the + write, which has similar semantics to + __threadfence_system() but is scoped to the stream + rather than a CUDA thread. + This flag is not supported in the v2 API. */ +} CUstreamWriteValue_flags; + +/** + * Operations for ::cuStreamBatchMemOp + */ +typedef enum CUstreamBatchMemOpType_enum { + CU_STREAM_MEM_OP_WAIT_VALUE_32 = 1, /**< Represents a ::cuStreamWaitValue32 operation */ + CU_STREAM_MEM_OP_WRITE_VALUE_32 = 2, /**< Represents a ::cuStreamWriteValue32 operation */ + CU_STREAM_MEM_OP_WAIT_VALUE_64 = 4, /**< Represents a ::cuStreamWaitValue64 operation */ + CU_STREAM_MEM_OP_WRITE_VALUE_64 = 5, /**< Represents a ::cuStreamWriteValue64 operation */ + CU_STREAM_MEM_OP_BARRIER = 6, /**< Insert a memory barrier of the specified type */ + CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES = 3 /**< This has the same effect as ::CU_STREAM_WAIT_VALUE_FLUSH, but as a + standalone operation. */ +} CUstreamBatchMemOpType; + +/** + * Flags for ::cuStreamMemoryBarrier + */ +typedef enum CUstreamMemoryBarrier_flags_enum { + CU_STREAM_MEMORY_BARRIER_TYPE_SYS = 0x0, /**< System-wide memory barrier. */ + CU_STREAM_MEMORY_BARRIER_TYPE_GPU = 0x1 /**< Limit memory barrier scope to the GPU. */ +} CUstreamMemoryBarrier_flags; + +/** + * Per-operation parameters for ::cuStreamBatchMemOp + */ +typedef union CUstreamBatchMemOpParams_union { + CUstreamBatchMemOpType operation; + struct CUstreamMemOpWaitValueParams_st { + CUstreamBatchMemOpType operation; + CUdeviceptr address; + union { + cuuint32_t value; + cuuint64_t value64; + }; + unsigned int flags; + CUdeviceptr alias; /**< For driver internal use. Initial value is unimportant. */ + } waitValue; + struct CUstreamMemOpWriteValueParams_st { + CUstreamBatchMemOpType operation; + CUdeviceptr address; + union { + cuuint32_t value; + cuuint64_t value64; + }; + unsigned int flags; + CUdeviceptr alias; /**< For driver internal use. Initial value is unimportant. */ + } writeValue; + struct CUstreamMemOpFlushRemoteWritesParams_st { + CUstreamBatchMemOpType operation; + unsigned int flags; + } flushRemoteWrites; + struct CUstreamMemOpMemoryBarrierParams_st { /**< Only supported in the _v2 API */ + CUstreamBatchMemOpType operation; + unsigned int flags; + } memoryBarrier; + cuuint64_t pad[6]; +} CUstreamBatchMemOpParams_v1; +typedef CUstreamBatchMemOpParams_v1 CUstreamBatchMemOpParams; + +typedef struct CUDA_BATCH_MEM_OP_NODE_PARAMS_st { + CUcontext ctx; + unsigned int count; + CUstreamBatchMemOpParams *paramArray; + unsigned int flags; +} CUDA_BATCH_MEM_OP_NODE_PARAMS; + +/** + * Occupancy calculator flag + */ +typedef enum CUoccupancy_flags_enum { + CU_OCCUPANCY_DEFAULT = 0x0, /**< Default behavior */ + CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE = 0x1 /**< Assume global caching is enabled and cannot be automatically turned off */ +} CUoccupancy_flags; + +/** + * Flags for ::cuStreamUpdateCaptureDependencies + */ +typedef enum CUstreamUpdateCaptureDependencies_flags_enum { + CU_STREAM_ADD_CAPTURE_DEPENDENCIES = 0x0, /**< Add new nodes to the dependency set */ + CU_STREAM_SET_CAPTURE_DEPENDENCIES = 0x1 /**< Replace the dependency set with the new nodes */ +} CUstreamUpdateCaptureDependencies_flags; + +/** + * Array formats + */ +typedef enum CUarray_format_enum { + CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, /**< Unsigned 8-bit integers */ + CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */ + CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */ + CU_AD_FORMAT_SIGNED_INT8 = 0x08, /**< Signed 8-bit integers */ + CU_AD_FORMAT_SIGNED_INT16 = 0x09, /**< Signed 16-bit integers */ + CU_AD_FORMAT_SIGNED_INT32 = 0x0a, /**< Signed 32-bit integers */ + CU_AD_FORMAT_HALF = 0x10, /**< 16-bit floating point */ + CU_AD_FORMAT_FLOAT = 0x20, /**< 32-bit floating point */ + CU_AD_FORMAT_NV12 = 0xb0, /**< 8-bit YUV planar format, with 4:2:0 sampling */ + CU_AD_FORMAT_UNORM_INT8X1 = 0xc0, /**< 1 channel unsigned 8-bit normalized integer */ + CU_AD_FORMAT_UNORM_INT8X2 = 0xc1, /**< 2 channel unsigned 8-bit normalized integer */ + CU_AD_FORMAT_UNORM_INT8X4 = 0xc2, /**< 4 channel unsigned 8-bit normalized integer */ + CU_AD_FORMAT_UNORM_INT16X1 = 0xc3, /**< 1 channel unsigned 16-bit normalized integer */ + CU_AD_FORMAT_UNORM_INT16X2 = 0xc4, /**< 2 channel unsigned 16-bit normalized integer */ + CU_AD_FORMAT_UNORM_INT16X4 = 0xc5, /**< 4 channel unsigned 16-bit normalized integer */ + CU_AD_FORMAT_SNORM_INT8X1 = 0xc6, /**< 1 channel signed 8-bit normalized integer */ + CU_AD_FORMAT_SNORM_INT8X2 = 0xc7, /**< 2 channel signed 8-bit normalized integer */ + CU_AD_FORMAT_SNORM_INT8X4 = 0xc8, /**< 4 channel signed 8-bit normalized integer */ + CU_AD_FORMAT_SNORM_INT16X1 = 0xc9, /**< 1 channel signed 16-bit normalized integer */ + CU_AD_FORMAT_SNORM_INT16X2 = 0xca, /**< 2 channel signed 16-bit normalized integer */ + CU_AD_FORMAT_SNORM_INT16X4 = 0xcb, /**< 4 channel signed 16-bit normalized integer */ + CU_AD_FORMAT_BC1_UNORM = 0x91, /**< 4 channel unsigned normalized block-compressed (BC1 compression) format */ + CU_AD_FORMAT_BC1_UNORM_SRGB = 0x92, /**< 4 channel unsigned normalized block-compressed (BC1 compression) format with sRGB encoding*/ + CU_AD_FORMAT_BC2_UNORM = 0x93, /**< 4 channel unsigned normalized block-compressed (BC2 compression) format */ + CU_AD_FORMAT_BC2_UNORM_SRGB = 0x94, /**< 4 channel unsigned normalized block-compressed (BC2 compression) format with sRGB encoding*/ + CU_AD_FORMAT_BC3_UNORM = 0x95, /**< 4 channel unsigned normalized block-compressed (BC3 compression) format */ + CU_AD_FORMAT_BC3_UNORM_SRGB = 0x96, /**< 4 channel unsigned normalized block-compressed (BC3 compression) format with sRGB encoding*/ + CU_AD_FORMAT_BC4_UNORM = 0x97, /**< 1 channel unsigned normalized block-compressed (BC4 compression) format */ + CU_AD_FORMAT_BC4_SNORM = 0x98, /**< 1 channel signed normalized block-compressed (BC4 compression) format */ + CU_AD_FORMAT_BC5_UNORM = 0x99, /**< 2 channel unsigned normalized block-compressed (BC5 compression) format */ + CU_AD_FORMAT_BC5_SNORM = 0x9a, /**< 2 channel signed normalized block-compressed (BC5 compression) format */ + CU_AD_FORMAT_BC6H_UF16 = 0x9b, /**< 3 channel unsigned half-float block-compressed (BC6H compression) format */ + CU_AD_FORMAT_BC6H_SF16 = 0x9c, /**< 3 channel signed half-float block-compressed (BC6H compression) format */ + CU_AD_FORMAT_BC7_UNORM = 0x9d, /**< 4 channel unsigned normalized block-compressed (BC7 compression) format */ + CU_AD_FORMAT_BC7_UNORM_SRGB = 0x9e /**< 4 channel unsigned normalized block-compressed (BC7 compression) format with sRGB encoding */ +} CUarray_format; + +/** + * Texture reference addressing modes + */ +typedef enum CUaddress_mode_enum { + CU_TR_ADDRESS_MODE_WRAP = 0, /**< Wrapping address mode */ + CU_TR_ADDRESS_MODE_CLAMP = 1, /**< Clamp to edge address mode */ + CU_TR_ADDRESS_MODE_MIRROR = 2, /**< Mirror address mode */ + CU_TR_ADDRESS_MODE_BORDER = 3 /**< Border address mode */ +} CUaddress_mode; + +/** + * Texture reference filtering modes + */ +typedef enum CUfilter_mode_enum { + CU_TR_FILTER_MODE_POINT = 0, /**< Point filter mode */ + CU_TR_FILTER_MODE_LINEAR = 1 /**< Linear filter mode */ +} CUfilter_mode; + +/** + * Device properties + */ +typedef enum CUdevice_attribute_enum { + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1, /**< Maximum number of threads per block */ + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, /**< Maximum block dimension X */ + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3, /**< Maximum block dimension Y */ + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4, /**< Maximum block dimension Z */ + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5, /**< Maximum grid dimension X */ + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6, /**< Maximum grid dimension Y */ + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7, /**< Maximum grid dimension Z */ + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8, /**< Maximum shared memory available per block in bytes */ + CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */ + CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */ + CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, /**< Warp size in threads */ + CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, /**< Maximum pitch in bytes allowed by memory copies */ + CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, /**< Maximum number of 32-bit registers available per block */ + CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */ + CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, /**< Typical clock frequency in kilohertz */ + CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, /**< Alignment requirement for textures */ + CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15, /**< Device can possibly copy memory and execute a kernel concurrently. Deprecated. Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT. */ + CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, /**< Number of multiprocessors on device */ + CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17, /**< Specifies whether there is a run time limit on kernels */ + CU_DEVICE_ATTRIBUTE_INTEGRATED = 18, /**< Device is integrated with host memory */ + CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19, /**< Device can map host memory into CUDA address space */ + CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20, /**< Compute mode (See ::CUcomputemode for details) */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21, /**< Maximum 1D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, /**< Maximum 2D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23, /**< Maximum 2D texture height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, /**< Maximum 3D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25, /**< Maximum 3D texture height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, /**< Maximum 3D texture depth */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27, /**< Maximum 2D layered texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28, /**< Maximum 2D layered texture height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29, /**< Maximum layers in a 2D layered texture */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS */ + CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, /**< Alignment requirement for surfaces */ + CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, /**< Device can possibly execute multiple kernels concurrently */ + CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32, /**< Device has ECC support enabled */ + CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33, /**< PCI bus ID of the device */ + CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34, /**< PCI device ID of the device */ + CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35, /**< Device is using TCC driver model */ + CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36, /**< Peak memory clock frequency in kilohertz */ + CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37, /**< Global memory bus width in bits */ + CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38, /**< Size of L2 cache in bytes */ + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39, /**< Maximum resident threads per multiprocessor */ + CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40, /**< Number of asynchronous engines */ + CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41, /**< Device shares a unified address space with the host */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42, /**< Maximum 1D layered texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43, /**< Maximum layers in a 1D layered texture */ + CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = 44, /**< Deprecated, do not use. */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45, /**< Maximum 2D texture width if CUDA_ARRAY3D_TEXTURE_GATHER is set */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46, /**< Maximum 2D texture height if CUDA_ARRAY3D_TEXTURE_GATHER is set */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47, /**< Alternate maximum 3D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48, /**< Alternate maximum 3D texture height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49, /**< Alternate maximum 3D texture depth */ + CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50, /**< PCI domain ID of the device */ + CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51, /**< Pitch alignment requirement for textures */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52, /**< Maximum cubemap texture width/height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53, /**< Maximum cubemap layered texture width/height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54, /**< Maximum layers in a cubemap layered texture */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55, /**< Maximum 1D surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56, /**< Maximum 2D surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57, /**< Maximum 2D surface height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58, /**< Maximum 3D surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59, /**< Maximum 3D surface height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60, /**< Maximum 3D surface depth */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61, /**< Maximum 1D layered surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62, /**< Maximum layers in a 1D layered surface */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63, /**< Maximum 2D layered surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64, /**< Maximum 2D layered surface height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65, /**< Maximum layers in a 2D layered surface */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66, /**< Maximum cubemap surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67, /**< Maximum cubemap layered surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68, /**< Maximum layers in a cubemap layered surface */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69, /**< Deprecated, do not use. Use cudaDeviceGetTexture1DLinearMaxWidth() or cuDeviceGetTexture1DLinearMaxWidth() instead. */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70, /**< Maximum 2D linear texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71, /**< Maximum 2D linear texture height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72, /**< Maximum 2D linear texture pitch in bytes */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73, /**< Maximum mipmapped 2D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74, /**< Maximum mipmapped 2D texture height */ + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75, /**< Major compute capability version number */ + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76, /**< Minor compute capability version number */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77, /**< Maximum mipmapped 1D texture width */ + CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78, /**< Device supports stream priorities */ + CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79, /**< Device supports caching globals in L1 */ + CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80, /**< Device supports caching locals in L1 */ + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81, /**< Maximum shared memory available per multiprocessor in bytes */ + CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82, /**< Maximum number of 32-bit registers available per multiprocessor */ + CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83, /**< Device can allocate managed memory on this system */ + CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84, /**< Device is on a multi-GPU board */ + CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85, /**< Unique id for a group of devices on the same multi-GPU board */ + CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86, /**< Link between the device and the host supports native atomic operations (this is a placeholder attribute, and is not supported on any current hardware)*/ + CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87, /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */ + CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88, /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */ + CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89, /**< Device can coherently access managed memory concurrently with the CPU */ + CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90, /**< Device supports compute preemption. */ + CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91, /**< Device can access host registered memory at the same virtual address as the CPU */ + CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS_V1 = 92, /**< Deprecated, along with v1 MemOps API, ::cuStreamBatchMemOp and related APIs are supported. */ + CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS_V1 = 93, /**< Deprecated, along with v1 MemOps API, 64-bit operations are supported in ::cuStreamBatchMemOp and related APIs. */ + CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR_V1 = 94, /**< Deprecated, along with v1 MemOps API, ::CU_STREAM_WAIT_VALUE_NOR is supported. */ + CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95, /**< Device supports launching cooperative kernels via ::cuLaunchCooperativeKernel */ + CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96, /**< Deprecated, ::cuLaunchCooperativeKernelMultiDevice is deprecated. */ + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97, /**< Maximum optin shared memory per block */ + CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES = 98, /**< The ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. See \ref CUDA_MEMOP for additional details. */ + CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = 99, /**< Device supports host memory registration via ::cudaHostRegister. */ + CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES = 100, /**< Device accesses pageable memory via the host's page tables. */ + CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 101, /**< The host can directly access managed memory on the device without migration. */ + CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED = 102, /**< Deprecated, Use CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED*/ + CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED = 102, /**< Device supports virtual memory management APIs like ::cuMemAddressReserve, ::cuMemCreate, ::cuMemMap and related APIs */ + CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED = 103, /**< Device supports exporting memory to a posix file descriptor with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */ + CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED = 104, /**< Device supports exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */ + CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED = 105, /**< Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */ + CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR = 106, /**< Maximum number of blocks per multiprocessor */ + CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED = 107, /**< Device supports compression of memory */ + CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE = 108, /**< Maximum L2 persisting lines capacity setting in bytes. */ + CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE = 109, /**< Maximum value of CUaccessPolicyWindow::num_bytes. */ + CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED = 110, /**< Device supports specifying the GPUDirect RDMA flag with ::cuMemCreate */ + CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK = 111, /**< Shared memory reserved by CUDA driver per block in bytes */ + CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED = 112, /**< Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays */ + CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED = 113, /**< Device supports using the ::cuMemHostRegister flag ::CU_MEMHOSTERGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU */ + CU_DEVICE_ATTRIBUTE_TIMELINE_SEMAPHORE_INTEROP_SUPPORTED = 114, /**< External timeline semaphore interop is supported on the device */ + CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED = 115, /**< Device supports using the ::cuMemAllocAsync and ::cuMemPool family of APIs */ + CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED = 116, /**< Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information) */ + CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS = 117, /**< The returned attribute shall be interpreted as a bitmask, where the individual bits are described by the ::CUflushGPUDirectRDMAWritesOptions enum */ + CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING = 118, /**< GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::CUGPUDirectRDMAWritesOrdering for the numerical values returned here. */ + CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES = 119, /**< Handle types supported with mempool based IPC */ + CU_DEVICE_ATTRIBUTE_CLUSTER_LAUNCH = 120, /**< Indicates device supports cluster launch */ + CU_DEVICE_ATTRIBUTE_DEFERRED_MAPPING_CUDA_ARRAY_SUPPORTED = 121, /**< Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays */ + CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 122, /**< 64-bit operations are supported in ::cuStreamBatchMemOp and related MemOp APIs. */ + CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 123, /**< ::CU_STREAM_WAIT_VALUE_NOR is supported by MemOp APIs. */ + CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED = 124, /**< Device supports buffer sharing with dma_buf mechanism. */ + CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED = 125, /**< Device supports IPC Events. */ + CU_DEVICE_ATTRIBUTE_MEM_SYNC_DOMAIN_COUNT = 126, /**< Number of memory domains the device supports. */ + CU_DEVICE_ATTRIBUTE_TENSOR_MAP_ACCESS_SUPPORTED = 127, /**< Device supports accessing memory using Tensor Map. */ + CU_DEVICE_ATTRIBUTE_UNIFIED_FUNCTION_POINTERS = 129, /**< Device supports unified function pointers. */ + CU_DEVICE_ATTRIBUTE_IS_NUMA_NODE = 130, + CU_DEVICE_ATTRIBUTE_NUMA_ID = 131, + CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED = 132, /**< Device supports switch multicast and reduction operations. */ + CU_DEVICE_ATTRIBUTE_MAX +} CUdevice_attribute; + +/** + * Legacy device properties + */ +typedef struct CUdevprop_st { + int maxThreadsPerBlock; /**< Maximum number of threads per block */ + int maxThreadsDim[3]; /**< Maximum size of each dimension of a block */ + int maxGridSize[3]; /**< Maximum size of each dimension of a grid */ + int sharedMemPerBlock; /**< Shared memory available per block in bytes */ + int totalConstantMemory; /**< Constant memory available on device in bytes */ + int SIMDWidth; /**< Warp size in threads */ + int memPitch; /**< Maximum pitch in bytes allowed by memory copies */ + int regsPerBlock; /**< 32-bit registers available per block */ + int clockRate; /**< Clock frequency in kilohertz */ + int textureAlign; /**< Alignment requirement for textures */ +} CUdevprop_v1; +typedef CUdevprop_v1 CUdevprop; + +/** + * Pointer information + */ +typedef enum CUpointer_attribute_enum { + CU_POINTER_ATTRIBUTE_CONTEXT = 1, /**< The ::CUcontext on which a pointer was allocated or registered */ + CU_POINTER_ATTRIBUTE_MEMORY_TYPE = 2, /**< The ::CUmemorytype describing the physical location of a pointer */ + CU_POINTER_ATTRIBUTE_DEVICE_POINTER = 3, /**< The address at which a pointer's memory may be accessed on the device */ + CU_POINTER_ATTRIBUTE_HOST_POINTER = 4, /**< The address at which a pointer's memory may be accessed on the host */ + CU_POINTER_ATTRIBUTE_P2P_TOKENS = 5, /**< A pair of tokens for use with the nv-p2p.h Linux kernel interface */ + CU_POINTER_ATTRIBUTE_SYNC_MEMOPS = 6, /**< Synchronize every synchronous memory operation initiated on this region */ + CU_POINTER_ATTRIBUTE_BUFFER_ID = 7, /**< A process-wide unique ID for an allocated memory region*/ + CU_POINTER_ATTRIBUTE_IS_MANAGED = 8, /**< Indicates if the pointer points to managed memory */ + CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL = 9, /**< A device ordinal of a device on which a pointer was allocated or registered */ + CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE = 10, /**< 1 if this pointer maps to an allocation that is suitable for ::cudaIpcGetMemHandle, 0 otherwise **/ + CU_POINTER_ATTRIBUTE_RANGE_START_ADDR = 11, /**< Starting address for this requested pointer */ + CU_POINTER_ATTRIBUTE_RANGE_SIZE = 12, /**< Size of the address range for this requested pointer */ + CU_POINTER_ATTRIBUTE_MAPPED = 13, /**< 1 if this pointer is in a valid address range that is mapped to a backing allocation, 0 otherwise **/ + CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES = 14, /**< Bitmask of allowed ::CUmemAllocationHandleType for this allocation **/ + CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE = 15, /**< 1 if the memory this pointer is referencing can be used with the GPUDirect RDMA API **/ + CU_POINTER_ATTRIBUTE_ACCESS_FLAGS = 16, /**< Returns the access flags the device associated with the current context has on the corresponding memory referenced by the pointer given */ + CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE = 17 /**< Returns the mempool handle for the allocation if it was allocated from a mempool. Otherwise returns NULL. **/ + , + CU_POINTER_ATTRIBUTE_MAPPING_SIZE = 18, /**< Size of the actual underlying mapping that the pointer belongs to **/ + CU_POINTER_ATTRIBUTE_MAPPING_BASE_ADDR = 19, /**< The start address of the mapping that the pointer belongs to **/ + CU_POINTER_ATTRIBUTE_MEMORY_BLOCK_ID = 20 /**< A process-wide unique id corresponding to the physical allocation the pointer belongs to **/ +} CUpointer_attribute; + +/** + * Function properties + */ +typedef enum CUfunction_attribute_enum { + /** + * The maximum number of threads per block, beyond which a launch of the + * function would fail. This number depends on both the function and the + * device on which the function is currently loaded. + */ + CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0, + + /** + * The size in bytes of statically-allocated shared memory required by + * this function. This does not include dynamically-allocated shared + * memory requested by the user at runtime. + */ + CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1, + + /** + * The size in bytes of user-allocated constant memory required by this + * function. + */ + CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2, + + /** + * The size in bytes of local memory used by each thread of this function. + */ + CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3, + + /** + * The number of registers used by each thread of this function. + */ + CU_FUNC_ATTRIBUTE_NUM_REGS = 4, + + /** + * The PTX virtual architecture version for which the function was + * compiled. This value is the major PTX version * 10 + the minor PTX + * version, so a PTX version 1.3 function would return the value 13. + * Note that this may return the undefined value of 0 for cubins + * compiled prior to CUDA 3.0. + */ + CU_FUNC_ATTRIBUTE_PTX_VERSION = 5, + + /** + * The binary architecture version for which the function was compiled. + * This value is the major binary version * 10 + the minor binary version, + * so a binary version 1.3 function would return the value 13. Note that + * this will return a value of 10 for legacy cubins that do not have a + * properly-encoded binary architecture version. + */ + CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6, + + /** + * The attribute to indicate whether the function has been compiled with + * user specified option "-Xptxas --dlcm=ca" set . + */ + CU_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7, + + /** + * The maximum size in bytes of dynamically-allocated shared memory that can be used by + * this function. If the user-specified dynamic shared memory size is larger than this + * value, the launch will fail. + * See ::cuFuncSetAttribute, ::cuKernelSetAttribute + */ + CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8, + + /** + * On devices where the L1 cache and shared memory use the same hardware resources, + * this sets the shared memory carveout preference, in percent of the total shared memory. + * Refer to ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR. + * This is only a hint, and the driver can choose a different ratio if required to execute the function. + * See ::cuFuncSetAttribute, ::cuKernelSetAttribute + */ + CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9, + + /** + * If this attribute is set, the kernel must launch with a valid cluster + * size specified. + * See ::cuFuncSetAttribute, ::cuKernelSetAttribute + */ + CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET = 10, + + /** + * The required cluster width in blocks. The values must either all be 0 or + * all be positive. The validity of the cluster dimensions is otherwise + * checked at launch time. + * + * If the value is set during compile time, it cannot be set at runtime. + * Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED. + * See ::cuFuncSetAttribute, ::cuKernelSetAttribute + */ + CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH = 11, + + /** + * The required cluster height in blocks. The values must either all be 0 or + * all be positive. The validity of the cluster dimensions is otherwise + * checked at launch time. + * + * If the value is set during compile time, it cannot be set at runtime. + * Setting it at runtime should return CUDA_ERROR_NOT_PERMITTED. + * See ::cuFuncSetAttribute, ::cuKernelSetAttribute + */ + CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT = 12, + + /** + * The required cluster depth in blocks. The values must either all be 0 or + * all be positive. The validity of the cluster dimensions is otherwise + * checked at launch time. + * + * If the value is set during compile time, it cannot be set at runtime. + * Setting it at runtime should return CUDA_ERROR_NOT_PERMITTED. + * See ::cuFuncSetAttribute, ::cuKernelSetAttribute + */ + CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH = 13, + + /** + * Whether the function can be launched with non-portable cluster size. 1 is + * allowed, 0 is disallowed. A non-portable cluster size may only function + * on the specific SKUs the program is tested on. The launch might fail if + * the program is run on a different hardware platform. + * + * CUDA API provides cudaOccupancyMaxActiveClusters to assist with checking + * whether the desired size can be launched on the current device. + * + * Portable Cluster Size + * + * A portable cluster size is guaranteed to be functional on all compute + * capabilities higher than the target compute capability. The portable + * cluster size for sm_90 is 8 blocks per cluster. This value may increase + * for future compute capabilities. + * + * The specific hardware unit may support higher cluster sizes that’s not + * guaranteed to be portable. + * See ::cuFuncSetAttribute, ::cuKernelSetAttribute + */ + CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED = 14, + + /** + * The block scheduling policy of a function. The value type is + * CUclusterSchedulingPolicy / cudaClusterSchedulingPolicy. + * See ::cuFuncSetAttribute, ::cuKernelSetAttribute + */ + CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE = 15, + + CU_FUNC_ATTRIBUTE_MAX +} CUfunction_attribute; + +/** + * Function cache configurations + */ +typedef enum CUfunc_cache_enum { + CU_FUNC_CACHE_PREFER_NONE = 0x00, /**< no preference for shared memory or L1 (default) */ + CU_FUNC_CACHE_PREFER_SHARED = 0x01, /**< prefer larger shared memory and smaller L1 cache */ + CU_FUNC_CACHE_PREFER_L1 = 0x02, /**< prefer larger L1 cache and smaller shared memory */ + CU_FUNC_CACHE_PREFER_EQUAL = 0x03 /**< prefer equal sized L1 cache and shared memory */ +} CUfunc_cache; + +/** + * Shared memory configurations + */ +typedef enum CUsharedconfig_enum { + CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE = 0x00, /**< set default shared memory bank size */ + CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE = 0x01, /**< set shared memory bank width to four bytes */ + CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE = 0x02 /**< set shared memory bank width to eight bytes */ +} CUsharedconfig; + +/** + * Shared memory carveout configurations. These may be passed to ::cuFuncSetAttribute or ::cuKernelSetAttribute + */ +typedef enum CUshared_carveout_enum { + CU_SHAREDMEM_CARVEOUT_DEFAULT = -1, /**< No preference for shared memory or L1 (default) */ + CU_SHAREDMEM_CARVEOUT_MAX_SHARED = 100, /**< Prefer maximum available shared memory, minimum L1 cache */ + CU_SHAREDMEM_CARVEOUT_MAX_L1 = 0 /**< Prefer maximum available L1 cache, minimum shared memory */ +} CUshared_carveout; + +/** + * Memory types + */ +typedef enum CUmemorytype_enum { + CU_MEMORYTYPE_HOST = 0x01, /**< Host memory */ + CU_MEMORYTYPE_DEVICE = 0x02, /**< Device memory */ + CU_MEMORYTYPE_ARRAY = 0x03, /**< Array memory */ + CU_MEMORYTYPE_UNIFIED = 0x04 /**< Unified device or host memory */ +} CUmemorytype; + +/** + * Compute Modes + */ +typedef enum CUcomputemode_enum { + CU_COMPUTEMODE_DEFAULT = 0, /**< Default compute mode (Multiple contexts allowed per device) */ + CU_COMPUTEMODE_PROHIBITED = 2, /**< Compute-prohibited mode (No contexts can be created on this device at this time) */ + CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3 /**< Compute-exclusive-process mode (Only one context used by a single process can be present on this device at a time) */ +} CUcomputemode; + +/** + * Memory advise values + */ +typedef enum CUmem_advise_enum { + CU_MEM_ADVISE_SET_READ_MOSTLY = 1, /**< Data will mostly be read and only occasionally be written to */ + CU_MEM_ADVISE_UNSET_READ_MOSTLY = 2, /**< Undo the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY */ + CU_MEM_ADVISE_SET_PREFERRED_LOCATION = 3, /**< Set the preferred location for the data as the specified device */ + CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION = 4, /**< Clear the preferred location for the data */ + CU_MEM_ADVISE_SET_ACCESSED_BY = 5, /**< Data will be accessed by the specified device, so prevent page faults as much as possible */ + CU_MEM_ADVISE_UNSET_ACCESSED_BY = 6 /**< Let the Unified Memory subsystem decide on the page faulting policy for the specified device */ +} CUmem_advise; + +typedef enum CUmem_range_attribute_enum { + CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY = 1, /**< Whether the range will mostly be read and only occasionally be written to */ + CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION = 2, /**< The preferred location of the range */ + CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY = 3, /**< Memory range has ::CU_MEM_ADVISE_SET_ACCESSED_BY set for specified device */ + CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION = 4 /**< The last location to which the range was prefetched */ +} CUmem_range_attribute; + +/** + * Online compiler and linker options + */ +typedef enum CUjit_option_enum +{ + /** + * Max number of registers that a thread may use.\n + * Option type: unsigned int\n + * Applies to: compiler only + */ + CU_JIT_MAX_REGISTERS = 0, + + /** + * IN: Specifies minimum number of threads per block to target compilation + * for\n + * OUT: Returns the number of threads the compiler actually targeted. + * This restricts the resource utilization of the compiler (e.g. max + * registers) such that a block with the given number of threads should be + * able to launch based on register limitations. Note, this option does not + * currently take into account any other resource limitations, such as + * shared memory utilization.\n + * Cannot be combined with ::CU_JIT_TARGET.\n + * Option type: unsigned int\n + * Applies to: compiler only + */ + CU_JIT_THREADS_PER_BLOCK = 1, + + /** + * Overwrites the option value with the total wall clock time, in + * milliseconds, spent in the compiler and linker\n + * Option type: float\n + * Applies to: compiler and linker + */ + CU_JIT_WALL_TIME = 2, + + /** + * Pointer to a buffer in which to print any log messages + * that are informational in nature (the buffer size is specified via + * option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES)\n + * Option type: char *\n + * Applies to: compiler and linker + */ + CU_JIT_INFO_LOG_BUFFER = 3, + + /** + * IN: Log buffer size in bytes. Log messages will be capped at this size + * (including null terminator)\n + * OUT: Amount of log buffer filled with messages\n + * Option type: unsigned int\n + * Applies to: compiler and linker + */ + CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES = 4, + + /** + * Pointer to a buffer in which to print any log messages that + * reflect errors (the buffer size is specified via option + * ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)\n + * Option type: char *\n + * Applies to: compiler and linker + */ + CU_JIT_ERROR_LOG_BUFFER = 5, + + /** + * IN: Log buffer size in bytes. Log messages will be capped at this size + * (including null terminator)\n + * OUT: Amount of log buffer filled with messages\n + * Option type: unsigned int\n + * Applies to: compiler and linker + */ + CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES = 6, + + /** + * Level of optimizations to apply to generated code (0 - 4), with 4 + * being the default and highest level of optimizations.\n + * Option type: unsigned int\n + * Applies to: compiler only + */ + CU_JIT_OPTIMIZATION_LEVEL = 7, + + /** + * No option value required. Determines the target based on the current + * attached context (default)\n + * Option type: No option value needed\n + * Applies to: compiler and linker + */ + CU_JIT_TARGET_FROM_CUCONTEXT = 8, + + /** + * Target is chosen based on supplied ::CUjit_target. Cannot be + * combined with ::CU_JIT_THREADS_PER_BLOCK.\n + * Option type: unsigned int for enumerated type ::CUjit_target\n + * Applies to: compiler and linker + */ + CU_JIT_TARGET = 9, + + /** + * Specifies choice of fallback strategy if matching cubin is not found. + * Choice is based on supplied ::CUjit_fallback. This option cannot be + * used with cuLink* APIs as the linker requires exact matches.\n + * Option type: unsigned int for enumerated type ::CUjit_fallback\n + * Applies to: compiler only + */ + CU_JIT_FALLBACK_STRATEGY = 10, + + /** + * Specifies whether to create debug information in output (-g) + * (0: false, default)\n + * Option type: int\n + * Applies to: compiler and linker + */ + CU_JIT_GENERATE_DEBUG_INFO = 11, + + /** + * Generate verbose log messages (0: false, default)\n + * Option type: int\n + * Applies to: compiler and linker + */ + CU_JIT_LOG_VERBOSE = 12, + + /** + * Generate line number information (-lineinfo) (0: false, default)\n + * Option type: int\n + * Applies to: compiler only + */ + CU_JIT_GENERATE_LINE_INFO = 13, + + /** + * Specifies whether to enable caching explicitly (-dlcm) \n + * Choice is based on supplied ::CUjit_cacheMode_enum.\n + * Option type: unsigned int for enumerated type ::CUjit_cacheMode_enum\n + * Applies to: compiler only + */ + CU_JIT_CACHE_MODE = 14, + + /** + * \deprecated + * This jit option is deprecated and should not be used. + */ + CU_JIT_NEW_SM3X_OPT = 15, + + /** + * This jit option is used for internal purpose only. + */ + CU_JIT_FAST_COMPILE = 16, + + /** + * Array of device symbol names that will be relocated to the corresponding + * host addresses stored in ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES.\n + * Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n + * When loading a device module, driver will relocate all encountered + * unresolved symbols to the host addresses.\n + * It is only allowed to register symbols that correspond to unresolved + * global variables.\n + * It is illegal to register the same device symbol at multiple addresses.\n + * Option type: const char **\n + * Applies to: dynamic linker only + */ + CU_JIT_GLOBAL_SYMBOL_NAMES = 17, + + /** + * Array of host addresses that will be used to relocate corresponding + * device symbols stored in ::CU_JIT_GLOBAL_SYMBOL_NAMES.\n + * Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n + * Option type: void **\n + * Applies to: dynamic linker only + */ + CU_JIT_GLOBAL_SYMBOL_ADDRESSES = 18, + + /** + * Number of entries in ::CU_JIT_GLOBAL_SYMBOL_NAMES and + * ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES arrays.\n + * Option type: unsigned int\n + * Applies to: dynamic linker only + */ + CU_JIT_GLOBAL_SYMBOL_COUNT = 19, + + /** + * \deprecated + * Enable link-time optimization (-dlto) for device code (Disabled by default).\n + * This option is not supported on 32-bit platforms.\n + * Option type: int\n + * Applies to: compiler and linker + * + * Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0 + */ + CU_JIT_LTO = 20, + + /** + * \deprecated + * Control single-precision denormals (-ftz) support (0: false, default). + * 1 : flushes denormal values to zero + * 0 : preserves denormal values + * Option type: int\n + * Applies to: link-time optimization specified with CU_JIT_LTO + * + * Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0 + */ + CU_JIT_FTZ = 21, + + /** + * \deprecated + * Control single-precision floating-point division and reciprocals + * (-prec-div) support (1: true, default). + * 1 : Enables the IEEE round-to-nearest mode + * 0 : Enables the fast approximation mode + * Option type: int\n + * Applies to: link-time optimization specified with CU_JIT_LTO + * + * Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0 + */ + CU_JIT_PREC_DIV = 22, + + /** + * \deprecated + * Control single-precision floating-point square root + * (-prec-sqrt) support (1: true, default). + * 1 : Enables the IEEE round-to-nearest mode + * 0 : Enables the fast approximation mode + * Option type: int\n + * Applies to: link-time optimization specified with CU_JIT_LTO + * + * Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0 + */ + CU_JIT_PREC_SQRT = 23, + + /** + * \deprecated + * Enable/Disable the contraction of floating-point multiplies + * and adds/subtracts into floating-point multiply-add (-fma) + * operations (1: Enable, default; 0: Disable). + * Option type: int\n + * Applies to: link-time optimization specified with CU_JIT_LTO + * + * Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0 + */ + CU_JIT_FMA = 24, + + /** + * \deprecated + * Array of kernel names that should be preserved at link time while others + * can be removed.\n + * Must contain ::CU_JIT_REFERENCED_KERNEL_COUNT entries.\n + * Note that kernel names can be mangled by the compiler in which case the + * mangled name needs to be specified.\n + * Wildcard "*" can be used to represent zero or more characters instead of + * specifying the full or mangled name.\n + * It is important to note that the wildcard "*" is also added implicitly. + * For example, specifying "foo" will match "foobaz", "barfoo", "barfoobaz" and + * thus preserve all kernels with those names. This can be avoided by providing + * a more specific name like "barfoobaz".\n + * Option type: const char **\n + * Applies to: dynamic linker only + * + * Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0 + */ + CU_JIT_REFERENCED_KERNEL_NAMES = 25, + + /** + * \deprecated + * Number of entries in ::CU_JIT_REFERENCED_KERNEL_NAMES array.\n + * Option type: unsigned int\n + * Applies to: dynamic linker only + * + * Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0 + */ + CU_JIT_REFERENCED_KERNEL_COUNT = 26, + + /** + * \deprecated + * Array of variable names (__device__ and/or __constant__) that should be + * preserved at link time while others can be removed.\n + * Must contain ::CU_JIT_REFERENCED_VARIABLE_COUNT entries.\n + * Note that variable names can be mangled by the compiler in which case the + * mangled name needs to be specified.\n + * Wildcard "*" can be used to represent zero or more characters instead of + * specifying the full or mangled name.\n + * It is important to note that the wildcard "*" is also added implicitly. + * For example, specifying "foo" will match "foobaz", "barfoo", "barfoobaz" and + * thus preserve all variables with those names. This can be avoided by providing + * a more specific name like "barfoobaz".\n + * Option type: const char **\n + * Applies to: link-time optimization specified with CU_JIT_LTO + * + * Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0 + */ + CU_JIT_REFERENCED_VARIABLE_NAMES = 27, + + /** + * \deprecated + * Number of entries in ::CU_JIT_REFERENCED_VARIABLE_NAMES array.\n + * Option type: unsigned int\n + * Applies to: link-time optimization specified with CU_JIT_LTO + * + * Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0 + */ + CU_JIT_REFERENCED_VARIABLE_COUNT = 28, + + /** + * \deprecated + * This option serves as a hint to enable the JIT compiler/linker + * to remove constant (__constant__) and device (__device__) variables + * unreferenced in device code (Disabled by default).\n + * Note that host references to constant and device variables using APIs like + * ::cuModuleGetGlobal() with this option specified may result in undefined behavior unless + * the variables are explicitly specified using ::CU_JIT_REFERENCED_VARIABLE_NAMES.\n + * Option type: int\n + * Applies to: link-time optimization specified with CU_JIT_LTO + * + * Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0 + */ + CU_JIT_OPTIMIZE_UNUSED_DEVICE_VARIABLES = 29, + + /** + * Generate position independent code (0: false)\n + * Option type: int\n + * Applies to: compiler only + */ + CU_JIT_POSITION_INDEPENDENT_CODE = 30, + + CU_JIT_NUM_OPTIONS + +} CUjit_option; + +/* + * Indicates that compute device class supports accelerated features. + */ +#define CU_COMPUTE_ACCELERATED_TARGET_BASE 0x10000 + +/** + * Online compilation targets + */ +typedef enum CUjit_target_enum +{ + CU_TARGET_COMPUTE_30 = 30, /**< Compute device class 3.0 */ + CU_TARGET_COMPUTE_32 = 32, /**< Compute device class 3.2 */ + CU_TARGET_COMPUTE_35 = 35, /**< Compute device class 3.5 */ + CU_TARGET_COMPUTE_37 = 37, /**< Compute device class 3.7 */ + CU_TARGET_COMPUTE_50 = 50, /**< Compute device class 5.0 */ + CU_TARGET_COMPUTE_52 = 52, /**< Compute device class 5.2 */ + CU_TARGET_COMPUTE_53 = 53, /**< Compute device class 5.3 */ + CU_TARGET_COMPUTE_60 = 60, /**< Compute device class 6.0.*/ + CU_TARGET_COMPUTE_61 = 61, /**< Compute device class 6.1.*/ + CU_TARGET_COMPUTE_62 = 62, /**< Compute device class 6.2.*/ + CU_TARGET_COMPUTE_70 = 70, /**< Compute device class 7.0.*/ + CU_TARGET_COMPUTE_72 = 72, /**< Compute device class 7.2.*/ + CU_TARGET_COMPUTE_75 = 75, /**< Compute device class 7.5.*/ + CU_TARGET_COMPUTE_80 = 80, /**< Compute device class 8.0.*/ + CU_TARGET_COMPUTE_86 = 86, /**< Compute device class 8.6.*/ + CU_TARGET_COMPUTE_87 = 87, /**< Compute device class 8.7.*/ + CU_TARGET_COMPUTE_89 = 89, /**< Compute device class 8.9.*/ + CU_TARGET_COMPUTE_90 = 90, /**< Compute device class 9.0.*/ + + /**< Compute device class 9.0. with accelerated features.*/ + CU_TARGET_COMPUTE_90A = CU_COMPUTE_ACCELERATED_TARGET_BASE + CU_TARGET_COMPUTE_90 +} CUjit_target; + +/** + * Cubin matching fallback strategies + */ +typedef enum CUjit_fallback_enum +{ + CU_PREFER_PTX = 0, /**< Prefer to compile ptx if exact binary match not found */ + + CU_PREFER_BINARY /**< Prefer to fall back to compatible binary code if exact match not found */ + +} CUjit_fallback; + +/** + * Caching modes for dlcm + */ +typedef enum CUjit_cacheMode_enum +{ + CU_JIT_CACHE_OPTION_NONE = 0, /**< Compile with no -dlcm flag specified */ + CU_JIT_CACHE_OPTION_CG, /**< Compile with L1 cache disabled */ + CU_JIT_CACHE_OPTION_CA /**< Compile with L1 cache enabled */ +} CUjit_cacheMode; + +/** + * Device code formats + */ +typedef enum CUjitInputType_enum +{ + /** + * Compiled device-class-specific device code\n + * Applicable options: none + */ + CU_JIT_INPUT_CUBIN = 0, + + /** + * PTX source code\n + * Applicable options: PTX compiler options + */ + CU_JIT_INPUT_PTX = 1, + + /** + * Bundle of multiple cubins and/or PTX of some device code\n + * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY + */ + CU_JIT_INPUT_FATBINARY = 2, + + /** + * Host object with embedded device code\n + * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY + */ + CU_JIT_INPUT_OBJECT = 3, + + /** + * Archive of host objects with embedded device code\n + * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY + */ + CU_JIT_INPUT_LIBRARY = 4, + + /** + * \deprecated + * High-level intermediate code for link-time optimization\n + * Applicable options: NVVM compiler options, PTX compiler options + * + * Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0 + */ + CU_JIT_INPUT_NVVM = 5, + + CU_JIT_NUM_INPUT_TYPES = 6 +} CUjitInputType; + +typedef struct CUlinkState_st *CUlinkState; + +/** + * Flags to register a graphics resource + */ +typedef enum CUgraphicsRegisterFlags_enum { + CU_GRAPHICS_REGISTER_FLAGS_NONE = 0x00, + CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY = 0x01, + CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD = 0x02, + CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST = 0x04, + CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER = 0x08 +} CUgraphicsRegisterFlags; + +/** + * Flags for mapping and unmapping interop resources + */ +typedef enum CUgraphicsMapResourceFlags_enum { + CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE = 0x00, + CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01, + CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02 +} CUgraphicsMapResourceFlags; + +/** + * Array indices for cube faces + */ +typedef enum CUarray_cubemap_face_enum { + CU_CUBEMAP_FACE_POSITIVE_X = 0x00, /**< Positive X face of cubemap */ + CU_CUBEMAP_FACE_NEGATIVE_X = 0x01, /**< Negative X face of cubemap */ + CU_CUBEMAP_FACE_POSITIVE_Y = 0x02, /**< Positive Y face of cubemap */ + CU_CUBEMAP_FACE_NEGATIVE_Y = 0x03, /**< Negative Y face of cubemap */ + CU_CUBEMAP_FACE_POSITIVE_Z = 0x04, /**< Positive Z face of cubemap */ + CU_CUBEMAP_FACE_NEGATIVE_Z = 0x05 /**< Negative Z face of cubemap */ +} CUarray_cubemap_face; + +/** + * Limits + */ +typedef enum CUlimit_enum { + CU_LIMIT_STACK_SIZE = 0x00, /**< GPU thread stack size */ + CU_LIMIT_PRINTF_FIFO_SIZE = 0x01, /**< GPU printf FIFO size */ + CU_LIMIT_MALLOC_HEAP_SIZE = 0x02, /**< GPU malloc heap size */ + CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH = 0x03, /**< GPU device runtime launch synchronize depth */ + CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT = 0x04, /**< GPU device runtime pending launch count */ + CU_LIMIT_MAX_L2_FETCH_GRANULARITY = 0x05, /**< A value between 0 and 128 that indicates the maximum fetch granularity of L2 (in Bytes). This is a hint */ + CU_LIMIT_PERSISTING_L2_CACHE_SIZE = 0x06, /**< A size in bytes for L2 persisting lines cache size */ + CU_LIMIT_MAX +} CUlimit; + +/** + * Resource types + */ +typedef enum CUresourcetype_enum { + CU_RESOURCE_TYPE_ARRAY = 0x00, /**< Array resource */ + CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, /**< Mipmapped array resource */ + CU_RESOURCE_TYPE_LINEAR = 0x02, /**< Linear resource */ + CU_RESOURCE_TYPE_PITCH2D = 0x03 /**< Pitch 2D resource */ +} CUresourcetype; + +#ifdef _WIN32 +#define CUDA_CB __stdcall +#else +#define CUDA_CB +#endif + +/** + * CUDA host function + * \param userData Argument value passed to the function + */ +typedef void (CUDA_CB *CUhostFn)(void *userData); + +/** + * Specifies performance hint with ::CUaccessPolicyWindow for hitProp and missProp members. + */ +typedef enum CUaccessProperty_enum { + CU_ACCESS_PROPERTY_NORMAL = 0, /**< Normal cache persistence. */ + CU_ACCESS_PROPERTY_STREAMING = 1, /**< Streaming access is less likely to persit from cache. */ + CU_ACCESS_PROPERTY_PERSISTING = 2 /**< Persisting access is more likely to persist in cache.*/ +} CUaccessProperty; + +/** + * Specifies an access policy for a window, a contiguous extent of memory + * beginning at base_ptr and ending at base_ptr + num_bytes. + * num_bytes is limited by CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE. + * Partition into many segments and assign segments such that: + * sum of "hit segments" / window == approx. ratio. + * sum of "miss segments" / window == approx 1-ratio. + * Segments and ratio specifications are fitted to the capabilities of + * the architecture. + * Accesses in a hit segment apply the hitProp access policy. + * Accesses in a miss segment apply the missProp access policy. + */ +typedef struct CUaccessPolicyWindow_st { + void *base_ptr; /**< Starting address of the access policy window. CUDA driver may align it. */ + size_t num_bytes; /**< Size in bytes of the window policy. CUDA driver may restrict the maximum size and alignment. */ + float hitRatio; /**< hitRatio specifies percentage of lines assigned hitProp, rest are assigned missProp. */ + CUaccessProperty hitProp; /**< ::CUaccessProperty set for hit. */ + CUaccessProperty missProp; /**< ::CUaccessProperty set for miss. Must be either NORMAL or STREAMING */ +} CUaccessPolicyWindow_v1; +/** + * Access policy window + */ +typedef CUaccessPolicyWindow_v1 CUaccessPolicyWindow; + +/** + * GPU kernel node parameters + */ +typedef struct CUDA_KERNEL_NODE_PARAMS_st { + CUfunction func; /**< Kernel to launch */ + unsigned int gridDimX; /**< Width of grid in blocks */ + unsigned int gridDimY; /**< Height of grid in blocks */ + unsigned int gridDimZ; /**< Depth of grid in blocks */ + unsigned int blockDimX; /**< X dimension of each thread block */ + unsigned int blockDimY; /**< Y dimension of each thread block */ + unsigned int blockDimZ; /**< Z dimension of each thread block */ + unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */ + void **kernelParams; /**< Array of pointers to kernel parameters */ + void **extra; /**< Extra options */ +} CUDA_KERNEL_NODE_PARAMS_v1; + +/** + * GPU kernel node parameters + */typedef struct CUDA_KERNEL_NODE_PARAMS_v2_st { + CUfunction func; /**< Kernel to launch */ + unsigned int gridDimX; /**< Width of grid in blocks */ + unsigned int gridDimY; /**< Height of grid in blocks */ + unsigned int gridDimZ; /**< Depth of grid in blocks */ + unsigned int blockDimX; /**< X dimension of each thread block */ + unsigned int blockDimY; /**< Y dimension of each thread block */ + unsigned int blockDimZ; /**< Z dimension of each thread block */ + unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */ + void **kernelParams; /**< Array of pointers to kernel parameters */ + void **extra; /**< Extra options */ + CUkernel kern; /**< Kernel to launch, will only be referenced if func is NULL */ + CUcontext ctx; /**< Context for the kernel task to run in. The value NULL will indicate the current context should be used by the api. This field is ignored if func is set. */ +} CUDA_KERNEL_NODE_PARAMS_v2; +typedef CUDA_KERNEL_NODE_PARAMS_v2 CUDA_KERNEL_NODE_PARAMS; + +/** + * Memset node parameters + */ +typedef struct CUDA_MEMSET_NODE_PARAMS_st { + CUdeviceptr dst; /**< Destination device pointer */ + size_t pitch; /**< Pitch of destination device pointer. Unused if height is 1 */ + unsigned int value; /**< Value to be set */ + unsigned int elementSize; /**< Size of each element in bytes. Must be 1, 2, or 4. */ + size_t width; /**< Width of the row in elements */ + size_t height; /**< Number of rows */ +} CUDA_MEMSET_NODE_PARAMS_v1; +typedef CUDA_MEMSET_NODE_PARAMS_v1 CUDA_MEMSET_NODE_PARAMS; + +/** + * Host node parameters + */ +typedef struct CUDA_HOST_NODE_PARAMS_st { + CUhostFn fn; /**< The function to call when the node executes */ + void* userData; /**< Argument to pass to the function */ +} CUDA_HOST_NODE_PARAMS_v1; +typedef CUDA_HOST_NODE_PARAMS_v1 CUDA_HOST_NODE_PARAMS; + +/** + * Graph node types + */ +typedef enum CUgraphNodeType_enum { + CU_GRAPH_NODE_TYPE_KERNEL = 0, /**< GPU kernel node */ + CU_GRAPH_NODE_TYPE_MEMCPY = 1, /**< Memcpy node */ + CU_GRAPH_NODE_TYPE_MEMSET = 2, /**< Memset node */ + CU_GRAPH_NODE_TYPE_HOST = 3, /**< Host (executable) node */ + CU_GRAPH_NODE_TYPE_GRAPH = 4, /**< Node which executes an embedded graph */ + CU_GRAPH_NODE_TYPE_EMPTY = 5, /**< Empty (no-op) node */ + CU_GRAPH_NODE_TYPE_WAIT_EVENT = 6, /**< External event wait node */ + CU_GRAPH_NODE_TYPE_EVENT_RECORD = 7, /**< External event record node */ + CU_GRAPH_NODE_TYPE_EXT_SEMAS_SIGNAL = 8, /**< External semaphore signal node */ + CU_GRAPH_NODE_TYPE_EXT_SEMAS_WAIT = 9, /**< External semaphore wait node */ + CU_GRAPH_NODE_TYPE_MEM_ALLOC = 10,/**< Memory Allocation Node */ + CU_GRAPH_NODE_TYPE_MEM_FREE = 11,/**< Memory Free Node */ + CU_GRAPH_NODE_TYPE_BATCH_MEM_OP = 12 /**< Batch MemOp Node */ +} CUgraphNodeType; + +/** + * Graph instantiation results +*/ +typedef enum CUgraphInstantiateResult_enum +{ + CUDA_GRAPH_INSTANTIATE_SUCCESS = 0, /**< Instantiation succeeded */ + CUDA_GRAPH_INSTANTIATE_ERROR = 1, /**< Instantiation failed for an unexpected reason which is described in the return value of the function */ + CUDA_GRAPH_INSTANTIATE_INVALID_STRUCTURE = 2, /**< Instantiation failed due to invalid structure, such as cycles */ + CUDA_GRAPH_INSTANTIATE_NODE_OPERATION_NOT_SUPPORTED = 3, /**< Instantiation for device launch failed because the graph contained an unsupported operation */ + CUDA_GRAPH_INSTANTIATE_MULTIPLE_CTXS_NOT_SUPPORTED = 4 /**< Instantiation for device launch failed due to the nodes belonging to different contexts */ +} CUgraphInstantiateResult; + +/** + * Graph instantiation parameters + */ +typedef struct CUDA_GRAPH_INSTANTIATE_PARAMS_st +{ + cuuint64_t flags; /**< Instantiation flags */ + CUstream hUploadStream; /**< Upload stream */ + CUgraphNode hErrNode_out; /**< The node which caused instantiation to fail, if any */ + CUgraphInstantiateResult result_out; /**< Whether instantiation was successful. If it failed, the reason why */ +} CUDA_GRAPH_INSTANTIATE_PARAMS; + +typedef enum CUsynchronizationPolicy_enum { + CU_SYNC_POLICY_AUTO = 1, + CU_SYNC_POLICY_SPIN = 2, + CU_SYNC_POLICY_YIELD = 3, + CU_SYNC_POLICY_BLOCKING_SYNC = 4 +} CUsynchronizationPolicy; + +/** + * Cluster scheduling policies. These may be passed to ::cuFuncSetAttribute or ::cuKernelSetAttribute + */ +typedef enum CUclusterSchedulingPolicy_enum { + CU_CLUSTER_SCHEDULING_POLICY_DEFAULT = 0, /**< the default policy */ + CU_CLUSTER_SCHEDULING_POLICY_SPREAD = 1, /**< spread the blocks within a cluster to the SMs */ + CU_CLUSTER_SCHEDULING_POLICY_LOAD_BALANCING = 2 /**< allow the hardware to load-balance the blocks in a cluster to the SMs */ +} CUclusterSchedulingPolicy; + +typedef enum CUlaunchMemSyncDomain_enum { + CU_LAUNCH_MEM_SYNC_DOMAIN_DEFAULT = 0, + CU_LAUNCH_MEM_SYNC_DOMAIN_REMOTE = 1 +} CUlaunchMemSyncDomain; + +typedef struct CUlaunchMemSyncDomainMap_st { + unsigned char default_; + unsigned char remote; +} CUlaunchMemSyncDomainMap; + +typedef enum CUlaunchAttributeID_enum { + CU_LAUNCH_ATTRIBUTE_IGNORE = 0 /**< Ignored entry, for convenient composition */ + , CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW = 1 /**< Valid for streams, graph nodes, launches. */ + , CU_LAUNCH_ATTRIBUTE_COOPERATIVE = 2 /**< Valid for graph nodes, launches. */ + , CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY = 3 /**< Valid for streams. */ + , CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION = 4 /**< Valid for graph nodes, launches. */ + , CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE = 5 /**< Valid for graph nodes, launches. */ + , CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION = 6 /**< Valid for launches. Setting + programmaticStreamSerializationAllowed to non-0 + signals that the kernel will use programmatic + means to resolve its stream dependency, so that + the CUDA runtime should opportunistically allow + the grid's execution to overlap with the previous + kernel in the stream, if that kernel requests the + overlap. The dependent launches can choose to wait + on the dependency using the programmatic sync + (cudaGridDependencySynchronize() or equivalent PTX + instructions). */ + , CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT = 7 /**< Valid for launches. Event recorded through this + launch attribute is guaranteed to only trigger + after all block in the associated kernel trigger + the event. A block can trigger the event through + PTX launchdep.release or CUDA builtin function + cudaTriggerProgrammaticLaunchCompletion(). A + trigger can also be inserted at the beginning of + each block's execution if triggerAtBlockStart is + set to non-0. The dependent launches can choose to + wait on the dependency using the programmatic sync + (cudaGridDependencySynchronize() or equivalent PTX + instructions). Note that dependents (including the + CPU thread calling cuEventSynchronize()) are not + guaranteed to observe the release precisely when + it is released. For example, cuEventSynchronize() + may only observe the event trigger long after the + associated kernel has completed. This recording + type is primarily meant for establishing + programmatic dependency between device tasks. The + event supplied must not be an interprocess or + interop event. The event must disable timing (i.e. + created with ::CU_EVENT_DISABLE_TIMING flag set). + */ + , CU_LAUNCH_ATTRIBUTE_PRIORITY = 8 /**< Valid for streams, graph nodes, launches. */ + , CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP = 9 + , CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN = 10 +#ifdef __CUDA_API_VERSION_INTERNAL + , CU_LAUNCH_ATTRIBUTE_MAX +#endif +} CUlaunchAttributeID; + +typedef union CUlaunchAttributeValue_union { + char pad[64]; /**< Pad to 64 bytes */ + CUaccessPolicyWindow accessPolicyWindow; /**< Attribute ::CUaccessPolicyWindow. */ + int cooperative; /**< Nonzero indicates a cooperative kernel (see ::cuLaunchCooperativeKernel). */ + CUsynchronizationPolicy syncPolicy; /**< ::CUsynchronizationPolicy for work queued up in this stream */ + struct { + unsigned int x; + unsigned int y; + unsigned int z; + } clusterDim; /**< Cluster dimensions for the kernel node. */ + CUclusterSchedulingPolicy clusterSchedulingPolicyPreference; /**< Cluster scheduling policy preference for the kernel node. */ + int programmaticStreamSerializationAllowed; + struct { + CUevent event; + int flags; /* Does not accept ::CU_EVENT_RECORD_EXTERNAL */ + int triggerAtBlockStart; + } programmaticEvent; + int priority; /**< Execution priority of the kernel. */ + CUlaunchMemSyncDomainMap memSyncDomainMap; + CUlaunchMemSyncDomain memSyncDomain; +} CUlaunchAttributeValue; + +typedef struct CUlaunchAttribute_st { + CUlaunchAttributeID id; + char pad[8 - sizeof(CUlaunchAttributeID)]; + CUlaunchAttributeValue value; +} CUlaunchAttribute; + +typedef struct CUlaunchConfig_st { + unsigned int gridDimX; /**< Width of grid in blocks */ + unsigned int gridDimY; /**< Height of grid in blocks */ + unsigned int gridDimZ; /**< Depth of grid in blocks */ + unsigned int blockDimX; /**< X dimension of each thread block */ + unsigned int blockDimY; /**< Y dimension of each thread block */ + unsigned int blockDimZ; /**< Z dimension of each thread block */ + unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */ + CUstream hStream; /**< Stream identifier */ + CUlaunchAttribute *attrs; /**< nullable if numAttrs == 0 */ + unsigned int numAttrs; /**< number of attributes populated in attrs */ +} CUlaunchConfig; + +typedef CUlaunchAttributeID CUkernelNodeAttrID; +#define CU_KERNEL_NODE_ATTRIBUTE_ACCESS_POLICY_WINDOW CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW +#define CU_KERNEL_NODE_ATTRIBUTE_COOPERATIVE CU_LAUNCH_ATTRIBUTE_COOPERATIVE +#define CU_KERNEL_NODE_ATTRIBUTE_CLUSTER_DIMENSION CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION +#define CU_KERNEL_NODE_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE +#define CU_KERNEL_NODE_ATTRIBUTE_PRIORITY CU_LAUNCH_ATTRIBUTE_PRIORITY +#define CU_KERNEL_NODE_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP +#define CU_KERNEL_NODE_ATTRIBUTE_MEM_SYNC_DOMAIN CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN + +typedef CUlaunchAttributeValue CUkernelNodeAttrValue_v1; +typedef CUkernelNodeAttrValue_v1 CUkernelNodeAttrValue; + +/** + * Possible stream capture statuses returned by ::cuStreamIsCapturing + */ +typedef enum CUstreamCaptureStatus_enum { + CU_STREAM_CAPTURE_STATUS_NONE = 0, /**< Stream is not capturing */ + CU_STREAM_CAPTURE_STATUS_ACTIVE = 1, /**< Stream is actively capturing */ + CU_STREAM_CAPTURE_STATUS_INVALIDATED = 2 /**< Stream is part of a capture sequence that + has been invalidated, but not terminated */ +} CUstreamCaptureStatus; + +/** + * Possible modes for stream capture thread interactions. For more details see + * ::cuStreamBeginCapture and ::cuThreadExchangeStreamCaptureMode + */ +typedef enum CUstreamCaptureMode_enum { + CU_STREAM_CAPTURE_MODE_GLOBAL = 0, + CU_STREAM_CAPTURE_MODE_THREAD_LOCAL = 1, + CU_STREAM_CAPTURE_MODE_RELAXED = 2 +} CUstreamCaptureMode; + +typedef CUlaunchAttributeID CUstreamAttrID; +#define CU_STREAM_ATTRIBUTE_ACCESS_POLICY_WINDOW CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW +#define CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY +#define CU_STREAM_ATTRIBUTE_PRIORITY CU_LAUNCH_ATTRIBUTE_PRIORITY +#define CU_STREAM_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP +#define CU_STREAM_ATTRIBUTE_MEM_SYNC_DOMAIN CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN + +typedef CUlaunchAttributeValue CUstreamAttrValue_v1; +typedef CUstreamAttrValue_v1 CUstreamAttrValue; + +/** + * Flags to specify search options. For more details see ::cuGetProcAddress + */ +typedef enum CUdriverProcAddress_flags_enum { + CU_GET_PROC_ADDRESS_DEFAULT = 0, /**< Default search mode for driver symbols. */ + CU_GET_PROC_ADDRESS_LEGACY_STREAM = 1 << 0, /**< Search for legacy versions of driver symbols. */ + CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM = 1 << 1 /**< Search for per-thread versions of driver symbols. */ +} CUdriverProcAddress_flags; + +/** + * Flags to indicate search status. For more details see ::cuGetProcAddress + */ +typedef enum CUdriverProcAddressQueryResult_enum { + CU_GET_PROC_ADDRESS_SUCCESS = 0, /**< Symbol was succesfully found */ + CU_GET_PROC_ADDRESS_SYMBOL_NOT_FOUND = 1, /**< Symbol was not found in search */ + CU_GET_PROC_ADDRESS_VERSION_NOT_SUFFICIENT = 2 /**< Symbol was found but version supplied was not sufficient */ +} CUdriverProcAddressQueryResult; + +/** + * Execution Affinity Types + */ +typedef enum CUexecAffinityType_enum { + CU_EXEC_AFFINITY_TYPE_SM_COUNT = 0, /**< Create a context with limited SMs. */ + CU_EXEC_AFFINITY_TYPE_MAX +} CUexecAffinityType; + +/** + * Value for ::CU_EXEC_AFFINITY_TYPE_SM_COUNT + */ +typedef struct CUexecAffinitySmCount_st { + unsigned int val; /**< The number of SMs the context is limited to use. */ +} CUexecAffinitySmCount_v1; +typedef CUexecAffinitySmCount_v1 CUexecAffinitySmCount; + +/** + * Execution Affinity Parameters + */ +typedef struct CUexecAffinityParam_st { + CUexecAffinityType type; + union { + CUexecAffinitySmCount smCount; /** Value for ::CU_EXEC_AFFINITY_TYPE_SM_COUNT */ + } param; +} CUexecAffinityParam_v1; +/** + * Execution Affinity Parameters + */ +typedef CUexecAffinityParam_v1 CUexecAffinityParam; + +/** + * Library options to be specified with ::cuLibraryLoadData() or ::cuLibraryLoadFromFile() + */ +typedef enum CUlibraryOption_enum +{ + CU_LIBRARY_HOST_UNIVERSAL_FUNCTION_AND_DATA_TABLE = 0, + + /** + * Specifes that the argument \p code passed to ::cuLibraryLoadData() will be preserved. + * Specifying this option will let the driver know that \p code can be accessed at any point + * until ::cuLibraryUnload(). The default behavior is for the driver to allocate and + * maintain its own copy of \p code. Note that this is only a memory usage optimization + * hint and the driver can choose to ignore it if required. + * Specifying this option with ::cuLibraryLoadFromFile() is invalid and + * will return ::CUDA_ERROR_INVALID_VALUE. + */ + CU_LIBRARY_BINARY_IS_PRESERVED = 1, + + CU_LIBRARY_NUM_OPTIONS +} CUlibraryOption; + +typedef struct CUlibraryHostUniversalFunctionAndDataTable_st +{ + void *functionTable; + size_t functionWindowSize; + void *dataTable; + size_t dataWindowSize; +} CUlibraryHostUniversalFunctionAndDataTable; + +/** + * Error codes + */ +typedef enum cudaError_enum { + /** + * The API call returned with no errors. In the case of query calls, this + * also means that the operation being queried is complete (see + * ::cuEventQuery() and ::cuStreamQuery()). + */ + CUDA_SUCCESS = 0, + + /** + * This indicates that one or more of the parameters passed to the API call + * is not within an acceptable range of values. + */ + CUDA_ERROR_INVALID_VALUE = 1, + + /** + * The API call failed because it was unable to allocate enough memory to + * perform the requested operation. + */ + CUDA_ERROR_OUT_OF_MEMORY = 2, + + /** + * This indicates that the CUDA driver has not been initialized with + * ::cuInit() or that initialization has failed. + */ + CUDA_ERROR_NOT_INITIALIZED = 3, + + /** + * This indicates that the CUDA driver is in the process of shutting down. + */ + CUDA_ERROR_DEINITIALIZED = 4, + + /** + * This indicates profiler is not initialized for this run. This can + * happen when the application is running with external profiling tools + * like visual profiler. + */ + CUDA_ERROR_PROFILER_DISABLED = 5, + + /** + * \deprecated + * This error return is deprecated as of CUDA 5.0. It is no longer an error + * to attempt to enable/disable the profiling via ::cuProfilerStart or + * ::cuProfilerStop without initialization. + */ + CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6, + + /** + * \deprecated + * This error return is deprecated as of CUDA 5.0. It is no longer an error + * to call cuProfilerStart() when profiling is already enabled. + */ + CUDA_ERROR_PROFILER_ALREADY_STARTED = 7, + + /** + * \deprecated + * This error return is deprecated as of CUDA 5.0. It is no longer an error + * to call cuProfilerStop() when profiling is already disabled. + */ + CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8, + + /** + * This indicates that the CUDA driver that the application has loaded is a + * stub library. Applications that run with the stub rather than a real + * driver loaded will result in CUDA API returning this error. + */ + CUDA_ERROR_STUB_LIBRARY = 34, + + /** + * This indicates that requested CUDA device is unavailable at the current + * time. Devices are often unavailable due to use of + * ::CU_COMPUTEMODE_EXCLUSIVE_PROCESS or ::CU_COMPUTEMODE_PROHIBITED. + */ + CUDA_ERROR_DEVICE_UNAVAILABLE = 46, + + /** + * This indicates that no CUDA-capable devices were detected by the installed + * CUDA driver. + */ + CUDA_ERROR_NO_DEVICE = 100, + + /** + * This indicates that the device ordinal supplied by the user does not + * correspond to a valid CUDA device or that the action requested is + * invalid for the specified device. + */ + CUDA_ERROR_INVALID_DEVICE = 101, + + /** + * This error indicates that the Grid license is not applied. + */ + CUDA_ERROR_DEVICE_NOT_LICENSED = 102, + + /** + * This indicates that the device kernel image is invalid. This can also + * indicate an invalid CUDA module. + */ + CUDA_ERROR_INVALID_IMAGE = 200, + + /** + * This most frequently indicates that there is no context bound to the + * current thread. This can also be returned if the context passed to an + * API call is not a valid handle (such as a context that has had + * ::cuCtxDestroy() invoked on it). This can also be returned if a user + * mixes different API versions (i.e. 3010 context with 3020 API calls). + * See ::cuCtxGetApiVersion() for more details. + */ + CUDA_ERROR_INVALID_CONTEXT = 201, + + /** + * This indicated that the context being supplied as a parameter to the + * API call was already the active context. + * \deprecated + * This error return is deprecated as of CUDA 3.2. It is no longer an + * error to attempt to push the active context via ::cuCtxPushCurrent(). + */ + CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202, + + /** + * This indicates that a map or register operation has failed. + */ + CUDA_ERROR_MAP_FAILED = 205, + + /** + * This indicates that an unmap or unregister operation has failed. + */ + CUDA_ERROR_UNMAP_FAILED = 206, + + /** + * This indicates that the specified array is currently mapped and thus + * cannot be destroyed. + */ + CUDA_ERROR_ARRAY_IS_MAPPED = 207, + + /** + * This indicates that the resource is already mapped. + */ + CUDA_ERROR_ALREADY_MAPPED = 208, + + /** + * This indicates that there is no kernel image available that is suitable + * for the device. This can occur when a user specifies code generation + * options for a particular CUDA source file that do not include the + * corresponding device configuration. + */ + CUDA_ERROR_NO_BINARY_FOR_GPU = 209, + + /** + * This indicates that a resource has already been acquired. + */ + CUDA_ERROR_ALREADY_ACQUIRED = 210, + + /** + * This indicates that a resource is not mapped. + */ + CUDA_ERROR_NOT_MAPPED = 211, + + /** + * This indicates that a mapped resource is not available for access as an + * array. + */ + CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212, + + /** + * This indicates that a mapped resource is not available for access as a + * pointer. + */ + CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213, + + /** + * This indicates that an uncorrectable ECC error was detected during + * execution. + */ + CUDA_ERROR_ECC_UNCORRECTABLE = 214, + + /** + * This indicates that the ::CUlimit passed to the API call is not + * supported by the active device. + */ + CUDA_ERROR_UNSUPPORTED_LIMIT = 215, + + /** + * This indicates that the ::CUcontext passed to the API call can + * only be bound to a single CPU thread at a time but is already + * bound to a CPU thread. + */ + CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216, + + /** + * This indicates that peer access is not supported across the given + * devices. + */ + CUDA_ERROR_PEER_ACCESS_UNSUPPORTED = 217, + + /** + * This indicates that a PTX JIT compilation failed. + */ + CUDA_ERROR_INVALID_PTX = 218, + + /** + * This indicates an error with OpenGL or DirectX context. + */ + CUDA_ERROR_INVALID_GRAPHICS_CONTEXT = 219, + + /** + * This indicates that an uncorrectable NVLink error was detected during the + * execution. + */ + CUDA_ERROR_NVLINK_UNCORRECTABLE = 220, + + /** + * This indicates that the PTX JIT compiler library was not found. + */ + CUDA_ERROR_JIT_COMPILER_NOT_FOUND = 221, + + /** + * This indicates that the provided PTX was compiled with an unsupported toolchain. + */ + + CUDA_ERROR_UNSUPPORTED_PTX_VERSION = 222, + + /** + * This indicates that the PTX JIT compilation was disabled. + */ + CUDA_ERROR_JIT_COMPILATION_DISABLED = 223, + + /** + * This indicates that the ::CUexecAffinityType passed to the API call is not + * supported by the active device. + */ + CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY = 224, + + /** + * This indicates that the code to be compiled by the PTX JIT contains + * unsupported call to cudaDeviceSynchronize. + */ + CUDA_ERROR_UNSUPPORTED_DEVSIDE_SYNC = 225, + + /** + * This indicates that the device kernel source is invalid. This includes + * compilation/linker errors encountered in device code or user error. + */ + CUDA_ERROR_INVALID_SOURCE = 300, + + /** + * This indicates that the file specified was not found. + */ + CUDA_ERROR_FILE_NOT_FOUND = 301, + + /** + * This indicates that a link to a shared object failed to resolve. + */ + CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302, + + /** + * This indicates that initialization of a shared object failed. + */ + CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303, + + /** + * This indicates that an OS call failed. + */ + CUDA_ERROR_OPERATING_SYSTEM = 304, + + /** + * This indicates that a resource handle passed to the API call was not + * valid. Resource handles are opaque types like ::CUstream and ::CUevent. + */ + CUDA_ERROR_INVALID_HANDLE = 400, + + /** + * This indicates that a resource required by the API call is not in a + * valid state to perform the requested operation. + */ + CUDA_ERROR_ILLEGAL_STATE = 401, + + /** + * This indicates that a named symbol was not found. Examples of symbols + * are global/constant variable names, driver function names, texture names, + * and surface names. + */ + CUDA_ERROR_NOT_FOUND = 500, + + /** + * This indicates that asynchronous operations issued previously have not + * completed yet. This result is not actually an error, but must be indicated + * differently than ::CUDA_SUCCESS (which indicates completion). Calls that + * may return this value include ::cuEventQuery() and ::cuStreamQuery(). + */ + CUDA_ERROR_NOT_READY = 600, + + /** + * While executing a kernel, the device encountered a + * load or store instruction on an invalid memory address. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_ILLEGAL_ADDRESS = 700, + + /** + * This indicates that a launch did not occur because it did not have + * appropriate resources. This error usually indicates that the user has + * attempted to pass too many arguments to the device kernel, or the + * kernel launch specifies too many threads for the kernel's register + * count. Passing arguments of the wrong size (i.e. a 64-bit pointer + * when a 32-bit int is expected) is equivalent to passing too many + * arguments and can also result in this error. + */ + CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701, + + /** + * This indicates that the device kernel took too long to execute. This can + * only occur if timeouts are enabled - see the device attribute + * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_LAUNCH_TIMEOUT = 702, + + /** + * This error indicates a kernel launch that uses an incompatible texturing + * mode. + */ + CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703, + + /** + * This error indicates that a call to ::cuCtxEnablePeerAccess() is + * trying to re-enable peer access to a context which has already + * had peer access to it enabled. + */ + CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704, + + /** + * This error indicates that ::cuCtxDisablePeerAccess() is + * trying to disable peer access which has not been enabled yet + * via ::cuCtxEnablePeerAccess(). + */ + CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705, + + /** + * This error indicates that the primary context for the specified device + * has already been initialized. + */ + CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708, + + /** + * This error indicates that the context current to the calling thread + * has been destroyed using ::cuCtxDestroy, or is a primary context which + * has not yet been initialized. + */ + CUDA_ERROR_CONTEXT_IS_DESTROYED = 709, + + /** + * A device-side assert triggered during kernel execution. The context + * cannot be used anymore, and must be destroyed. All existing device + * memory allocations from this context are invalid and must be + * reconstructed if the program is to continue using CUDA. + */ + CUDA_ERROR_ASSERT = 710, + + /** + * This error indicates that the hardware resources required to enable + * peer access have been exhausted for one or more of the devices + * passed to ::cuCtxEnablePeerAccess(). + */ + CUDA_ERROR_TOO_MANY_PEERS = 711, + + /** + * This error indicates that the memory range passed to ::cuMemHostRegister() + * has already been registered. + */ + CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712, + + /** + * This error indicates that the pointer passed to ::cuMemHostUnregister() + * does not correspond to any currently registered memory region. + */ + CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED = 713, + + /** + * While executing a kernel, the device encountered a stack error. + * This can be due to stack corruption or exceeding the stack size limit. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_HARDWARE_STACK_ERROR = 714, + + /** + * While executing a kernel, the device encountered an illegal instruction. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_ILLEGAL_INSTRUCTION = 715, + + /** + * While executing a kernel, the device encountered a load or store instruction + * on a memory address which is not aligned. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_MISALIGNED_ADDRESS = 716, + + /** + * While executing a kernel, the device encountered an instruction + * which can only operate on memory locations in certain address spaces + * (global, shared, or local), but was supplied a memory address not + * belonging to an allowed address space. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_INVALID_ADDRESS_SPACE = 717, + + /** + * While executing a kernel, the device program counter wrapped its address space. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_INVALID_PC = 718, + + /** + * An exception occurred on the device while executing a kernel. Common + * causes include dereferencing an invalid device pointer and accessing + * out of bounds shared memory. Less common cases can be system specific - more + * information about these cases can be found in the system specific user guide. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_LAUNCH_FAILED = 719, + + /** + * This error indicates that the number of blocks launched per grid for a kernel that was + * launched via either ::cuLaunchCooperativeKernel or ::cuLaunchCooperativeKernelMultiDevice + * exceeds the maximum number of blocks as allowed by ::cuOccupancyMaxActiveBlocksPerMultiprocessor + * or ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the number of multiprocessors + * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. + */ + CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720, + + /** + * This error indicates that the attempted operation is not permitted. + */ + CUDA_ERROR_NOT_PERMITTED = 800, + + /** + * This error indicates that the attempted operation is not supported + * on the current system or device. + */ + CUDA_ERROR_NOT_SUPPORTED = 801, + + /** + * This error indicates that the system is not yet ready to start any CUDA + * work. To continue using CUDA, verify the system configuration is in a + * valid state and all required driver daemons are actively running. + * More information about this error can be found in the system specific + * user guide. + */ + CUDA_ERROR_SYSTEM_NOT_READY = 802, + + /** + * This error indicates that there is a mismatch between the versions of + * the display driver and the CUDA driver. Refer to the compatibility documentation + * for supported versions. + */ + CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803, + + /** + * This error indicates that the system was upgraded to run with forward compatibility + * but the visible hardware detected by CUDA does not support this configuration. + * Refer to the compatibility documentation for the supported hardware matrix or ensure + * that only supported hardware is visible during initialization via the CUDA_VISIBLE_DEVICES + * environment variable. + */ + CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE = 804, + + /** + * This error indicates that the MPS client failed to connect to the MPS control daemon or the MPS server. + */ + CUDA_ERROR_MPS_CONNECTION_FAILED = 805, + + /** + * This error indicates that the remote procedural call between the MPS server and the MPS client failed. + */ + CUDA_ERROR_MPS_RPC_FAILURE = 806, + + /** + * This error indicates that the MPS server is not ready to accept new MPS client requests. + * This error can be returned when the MPS server is in the process of recovering from a fatal failure. + */ + CUDA_ERROR_MPS_SERVER_NOT_READY = 807, + + /** + * This error indicates that the hardware resources required to create MPS client have been exhausted. + */ + CUDA_ERROR_MPS_MAX_CLIENTS_REACHED = 808, + + /** + * This error indicates the the hardware resources required to support device connections have been exhausted. + */ + CUDA_ERROR_MPS_MAX_CONNECTIONS_REACHED = 809, + + /** + * This error indicates that the MPS client has been terminated by the server. To continue using CUDA, the process must be terminated and relaunched. + */ + CUDA_ERROR_MPS_CLIENT_TERMINATED = 810, + + /** + * This error indicates that the module is using CUDA Dynamic Parallelism, but the current configuration, like MPS, does not support it. + */ + CUDA_ERROR_CDP_NOT_SUPPORTED = 811, + + /** + * This error indicates that a module contains an unsupported interaction between different versions of CUDA Dynamic Parallelism. + */ + CUDA_ERROR_CDP_VERSION_MISMATCH = 812, + + /** + * This error indicates that the operation is not permitted when + * the stream is capturing. + */ + CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED = 900, + + /** + * This error indicates that the current capture sequence on the stream + * has been invalidated due to a previous error. + */ + CUDA_ERROR_STREAM_CAPTURE_INVALIDATED = 901, + + /** + * This error indicates that the operation would have resulted in a merge + * of two independent capture sequences. + */ + CUDA_ERROR_STREAM_CAPTURE_MERGE = 902, + + /** + * This error indicates that the capture was not initiated in this stream. + */ + CUDA_ERROR_STREAM_CAPTURE_UNMATCHED = 903, + + /** + * This error indicates that the capture sequence contains a fork that was + * not joined to the primary stream. + */ + CUDA_ERROR_STREAM_CAPTURE_UNJOINED = 904, + + /** + * This error indicates that a dependency would have been created which + * crosses the capture sequence boundary. Only implicit in-stream ordering + * dependencies are allowed to cross the boundary. + */ + CUDA_ERROR_STREAM_CAPTURE_ISOLATION = 905, + + /** + * This error indicates a disallowed implicit dependency on a current capture + * sequence from cudaStreamLegacy. + */ + CUDA_ERROR_STREAM_CAPTURE_IMPLICIT = 906, + + /** + * This error indicates that the operation is not permitted on an event which + * was last recorded in a capturing stream. + */ + CUDA_ERROR_CAPTURED_EVENT = 907, + + /** + * A stream capture sequence not initiated with the ::CU_STREAM_CAPTURE_MODE_RELAXED + * argument to ::cuStreamBeginCapture was passed to ::cuStreamEndCapture in a + * different thread. + */ + CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD = 908, + + /** + * This error indicates that the timeout specified for the wait operation has lapsed. + */ + CUDA_ERROR_TIMEOUT = 909, + + /** + * This error indicates that the graph update was not performed because it included + * changes which violated constraints specific to instantiated graph update. + */ + CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE = 910, + + /** + * This indicates that an async error has occurred in a device outside of CUDA. + * If CUDA was waiting for an external device's signal before consuming shared data, + * the external device signaled an error indicating that the data is not valid for + * consumption. This leaves the process in an inconsistent state and any further CUDA + * work will return the same error. To continue using CUDA, the process must be + * terminated and relaunched. + */ + CUDA_ERROR_EXTERNAL_DEVICE = 911, + + /** + * Indicates a kernel launch error due to cluster misconfiguration. + */ + CUDA_ERROR_INVALID_CLUSTER_SIZE = 912, + + /** + * This indicates that an unknown internal error has occurred. + */ + CUDA_ERROR_UNKNOWN = 999 +} CUresult; + +/** + * P2P Attributes + */ +typedef enum CUdevice_P2PAttribute_enum { + CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK = 0x01, /**< A relative value indicating the performance of the link between two devices */ + CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED = 0x02, /**< P2P Access is enable */ + CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED = 0x03, /**< Atomic operation over the link supported */ + CU_DEVICE_P2P_ATTRIBUTE_ACCESS_ACCESS_SUPPORTED = 0x04, /**< \deprecated use CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED instead */ + CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED = 0x04 /**< Accessing CUDA arrays over the link supported */ +} CUdevice_P2PAttribute; + +/** + * CUDA stream callback + * \param hStream The stream the callback was added to, as passed to ::cuStreamAddCallback. May be NULL. + * \param status ::CUDA_SUCCESS or any persistent error on the stream. + * \param userData User parameter provided at registration. + */ +typedef void (CUDA_CB *CUstreamCallback)(CUstream hStream, CUresult status, void *userData); + +/** + * Block size to per-block dynamic shared memory mapping for a certain + * kernel \param blockSize Block size of the kernel. + * + * \return The dynamic shared memory needed by a block. + */ +typedef size_t (CUDA_CB *CUoccupancyB2DSize)(int blockSize); + +/** + * If set, host memory is portable between CUDA contexts. + * Flag for ::cuMemHostAlloc() + */ +#define CU_MEMHOSTALLOC_PORTABLE 0x01 + +/** + * If set, host memory is mapped into CUDA address space and + * ::cuMemHostGetDevicePointer() may be called on the host pointer. + * Flag for ::cuMemHostAlloc() + */ +#define CU_MEMHOSTALLOC_DEVICEMAP 0x02 + +/** + * If set, host memory is allocated as write-combined - fast to write, + * faster to DMA, slow to read except via SSE4 streaming load instruction + * (MOVNTDQA). + * Flag for ::cuMemHostAlloc() + */ +#define CU_MEMHOSTALLOC_WRITECOMBINED 0x04 + +/** + * If set, host memory is portable between CUDA contexts. + * Flag for ::cuMemHostRegister() + */ +#define CU_MEMHOSTREGISTER_PORTABLE 0x01 + +/** + * If set, host memory is mapped into CUDA address space and + * ::cuMemHostGetDevicePointer() may be called on the host pointer. + * Flag for ::cuMemHostRegister() + */ +#define CU_MEMHOSTREGISTER_DEVICEMAP 0x02 + +/** + * If set, the passed memory pointer is treated as pointing to some + * memory-mapped I/O space, e.g. belonging to a third-party PCIe device. + * On Windows the flag is a no-op. + * On Linux that memory is marked as non cache-coherent for the GPU and + * is expected to be physically contiguous. It may return + * ::CUDA_ERROR_NOT_PERMITTED if run as an unprivileged user, + * ::CUDA_ERROR_NOT_SUPPORTED on older Linux kernel versions. + * On all other platforms, it is not supported and ::CUDA_ERROR_NOT_SUPPORTED + * is returned. + * Flag for ::cuMemHostRegister() + */ +#define CU_MEMHOSTREGISTER_IOMEMORY 0x04 + +/** +* If set, the passed memory pointer is treated as pointing to memory that is +* considered read-only by the device. On platforms without +* ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, this flag is +* required in order to register memory mapped to the CPU as read-only. Support +* for the use of this flag can be queried from the device attribute +* ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED. Using this flag with +* a current context associated with a device that does not have this attribute +* set will cause ::cuMemHostRegister to error with ::CUDA_ERROR_NOT_SUPPORTED. +*/ +#define CU_MEMHOSTREGISTER_READ_ONLY 0x08 + +/** + * 2D memory copy parameters + */ +typedef struct CUDA_MEMCPY2D_st { + size_t srcXInBytes; /**< Source X in bytes */ + size_t srcY; /**< Source Y */ + + CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ + const void *srcHost; /**< Source host pointer */ + CUdeviceptr srcDevice; /**< Source device pointer */ + CUarray srcArray; /**< Source array reference */ + size_t srcPitch; /**< Source pitch (ignored when src is array) */ + + size_t dstXInBytes; /**< Destination X in bytes */ + size_t dstY; /**< Destination Y */ + + CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ + void *dstHost; /**< Destination host pointer */ + CUdeviceptr dstDevice; /**< Destination device pointer */ + CUarray dstArray; /**< Destination array reference */ + size_t dstPitch; /**< Destination pitch (ignored when dst is array) */ + + size_t WidthInBytes; /**< Width of 2D memory copy in bytes */ + size_t Height; /**< Height of 2D memory copy */ +} CUDA_MEMCPY2D_v2; +typedef CUDA_MEMCPY2D_v2 CUDA_MEMCPY2D; + +/** + * 3D memory copy parameters + */ +typedef struct CUDA_MEMCPY3D_st { + size_t srcXInBytes; /**< Source X in bytes */ + size_t srcY; /**< Source Y */ + size_t srcZ; /**< Source Z */ + size_t srcLOD; /**< Source LOD */ + CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ + const void *srcHost; /**< Source host pointer */ + CUdeviceptr srcDevice; /**< Source device pointer */ + CUarray srcArray; /**< Source array reference */ + void *reserved0; /**< Must be NULL */ + size_t srcPitch; /**< Source pitch (ignored when src is array) */ + size_t srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */ + + size_t dstXInBytes; /**< Destination X in bytes */ + size_t dstY; /**< Destination Y */ + size_t dstZ; /**< Destination Z */ + size_t dstLOD; /**< Destination LOD */ + CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ + void *dstHost; /**< Destination host pointer */ + CUdeviceptr dstDevice; /**< Destination device pointer */ + CUarray dstArray; /**< Destination array reference */ + void *reserved1; /**< Must be NULL */ + size_t dstPitch; /**< Destination pitch (ignored when dst is array) */ + size_t dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */ + + size_t WidthInBytes; /**< Width of 3D memory copy in bytes */ + size_t Height; /**< Height of 3D memory copy */ + size_t Depth; /**< Depth of 3D memory copy */ +} CUDA_MEMCPY3D_v2; +typedef CUDA_MEMCPY3D_v2 CUDA_MEMCPY3D; + +/** + * 3D memory cross-context copy parameters + */ +typedef struct CUDA_MEMCPY3D_PEER_st { + size_t srcXInBytes; /**< Source X in bytes */ + size_t srcY; /**< Source Y */ + size_t srcZ; /**< Source Z */ + size_t srcLOD; /**< Source LOD */ + CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ + const void *srcHost; /**< Source host pointer */ + CUdeviceptr srcDevice; /**< Source device pointer */ + CUarray srcArray; /**< Source array reference */ + CUcontext srcContext; /**< Source context (ignored with srcMemoryType is ::CU_MEMORYTYPE_ARRAY) */ + size_t srcPitch; /**< Source pitch (ignored when src is array) */ + size_t srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */ + + size_t dstXInBytes; /**< Destination X in bytes */ + size_t dstY; /**< Destination Y */ + size_t dstZ; /**< Destination Z */ + size_t dstLOD; /**< Destination LOD */ + CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ + void *dstHost; /**< Destination host pointer */ + CUdeviceptr dstDevice; /**< Destination device pointer */ + CUarray dstArray; /**< Destination array reference */ + CUcontext dstContext; /**< Destination context (ignored with dstMemoryType is ::CU_MEMORYTYPE_ARRAY) */ + size_t dstPitch; /**< Destination pitch (ignored when dst is array) */ + size_t dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */ + + size_t WidthInBytes; /**< Width of 3D memory copy in bytes */ + size_t Height; /**< Height of 3D memory copy */ + size_t Depth; /**< Depth of 3D memory copy */ +} CUDA_MEMCPY3D_PEER_v1; +typedef CUDA_MEMCPY3D_PEER_v1 CUDA_MEMCPY3D_PEER; + +/** + * Array descriptor + */ +typedef struct CUDA_ARRAY_DESCRIPTOR_st +{ + size_t Width; /**< Width of array */ + size_t Height; /**< Height of array */ + + CUarray_format Format; /**< Array format */ + unsigned int NumChannels; /**< Channels per array element */ +} CUDA_ARRAY_DESCRIPTOR_v2; +typedef CUDA_ARRAY_DESCRIPTOR_v2 CUDA_ARRAY_DESCRIPTOR; + +/** + * 3D array descriptor + */ +typedef struct CUDA_ARRAY3D_DESCRIPTOR_st +{ + size_t Width; /**< Width of 3D array */ + size_t Height; /**< Height of 3D array */ + size_t Depth; /**< Depth of 3D array */ + + CUarray_format Format; /**< Array format */ + unsigned int NumChannels; /**< Channels per array element */ + unsigned int Flags; /**< Flags */ +} CUDA_ARRAY3D_DESCRIPTOR_v2; +typedef CUDA_ARRAY3D_DESCRIPTOR_v2 CUDA_ARRAY3D_DESCRIPTOR; + +/** + * Indicates that the layered sparse CUDA array or CUDA mipmapped array has a single mip tail region for all layers + */ +#define CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL 0x1 + +/** + * CUDA array sparse properties + */ +typedef struct CUDA_ARRAY_SPARSE_PROPERTIES_st { + struct { + unsigned int width; /**< Width of sparse tile in elements */ + unsigned int height; /**< Height of sparse tile in elements */ + unsigned int depth; /**< Depth of sparse tile in elements */ + } tileExtent; + + /** + * First mip level at which the mip tail begins. + */ + unsigned int miptailFirstLevel; + /** + * Total size of the mip tail. + */ + unsigned long long miptailSize; + /** + * Flags will either be zero or ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL + */ + unsigned int flags; + unsigned int reserved[4]; +} CUDA_ARRAY_SPARSE_PROPERTIES_v1; +typedef CUDA_ARRAY_SPARSE_PROPERTIES_v1 CUDA_ARRAY_SPARSE_PROPERTIES; + +/** + * CUDA array memory requirements + */ +typedef struct CUDA_ARRAY_MEMORY_REQUIREMENTS_st { + size_t size; /**< Total required memory size */ + size_t alignment; /**< alignment requirement */ + unsigned int reserved[4]; +} CUDA_ARRAY_MEMORY_REQUIREMENTS_v1; +typedef CUDA_ARRAY_MEMORY_REQUIREMENTS_v1 CUDA_ARRAY_MEMORY_REQUIREMENTS; + +/** + * CUDA Resource descriptor + */ +typedef struct CUDA_RESOURCE_DESC_st +{ + CUresourcetype resType; /**< Resource type */ + + union { + struct { + CUarray hArray; /**< CUDA array */ + } array; + struct { + CUmipmappedArray hMipmappedArray; /**< CUDA mipmapped array */ + } mipmap; + struct { + CUdeviceptr devPtr; /**< Device pointer */ + CUarray_format format; /**< Array format */ + unsigned int numChannels; /**< Channels per array element */ + size_t sizeInBytes; /**< Size in bytes */ + } linear; + struct { + CUdeviceptr devPtr; /**< Device pointer */ + CUarray_format format; /**< Array format */ + unsigned int numChannels; /**< Channels per array element */ + size_t width; /**< Width of the array in elements */ + size_t height; /**< Height of the array in elements */ + size_t pitchInBytes; /**< Pitch between two rows in bytes */ + } pitch2D; + struct { + int reserved[32]; + } reserved; + } res; + + unsigned int flags; /**< Flags (must be zero) */ +} CUDA_RESOURCE_DESC_v1; +typedef CUDA_RESOURCE_DESC_v1 CUDA_RESOURCE_DESC; + +/** + * Texture descriptor + */ +typedef struct CUDA_TEXTURE_DESC_st { + CUaddress_mode addressMode[3]; /**< Address modes */ + CUfilter_mode filterMode; /**< Filter mode */ + unsigned int flags; /**< Flags */ + unsigned int maxAnisotropy; /**< Maximum anisotropy ratio */ + CUfilter_mode mipmapFilterMode; /**< Mipmap filter mode */ + float mipmapLevelBias; /**< Mipmap level bias */ + float minMipmapLevelClamp; /**< Mipmap minimum level clamp */ + float maxMipmapLevelClamp; /**< Mipmap maximum level clamp */ + float borderColor[4]; /**< Border Color */ + int reserved[12]; +} CUDA_TEXTURE_DESC_v1; +typedef CUDA_TEXTURE_DESC_v1 CUDA_TEXTURE_DESC; + +/** + * Resource view format + */ +typedef enum CUresourceViewFormat_enum +{ + CU_RES_VIEW_FORMAT_NONE = 0x00, /**< No resource view format (use underlying resource format) */ + CU_RES_VIEW_FORMAT_UINT_1X8 = 0x01, /**< 1 channel unsigned 8-bit integers */ + CU_RES_VIEW_FORMAT_UINT_2X8 = 0x02, /**< 2 channel unsigned 8-bit integers */ + CU_RES_VIEW_FORMAT_UINT_4X8 = 0x03, /**< 4 channel unsigned 8-bit integers */ + CU_RES_VIEW_FORMAT_SINT_1X8 = 0x04, /**< 1 channel signed 8-bit integers */ + CU_RES_VIEW_FORMAT_SINT_2X8 = 0x05, /**< 2 channel signed 8-bit integers */ + CU_RES_VIEW_FORMAT_SINT_4X8 = 0x06, /**< 4 channel signed 8-bit integers */ + CU_RES_VIEW_FORMAT_UINT_1X16 = 0x07, /**< 1 channel unsigned 16-bit integers */ + CU_RES_VIEW_FORMAT_UINT_2X16 = 0x08, /**< 2 channel unsigned 16-bit integers */ + CU_RES_VIEW_FORMAT_UINT_4X16 = 0x09, /**< 4 channel unsigned 16-bit integers */ + CU_RES_VIEW_FORMAT_SINT_1X16 = 0x0a, /**< 1 channel signed 16-bit integers */ + CU_RES_VIEW_FORMAT_SINT_2X16 = 0x0b, /**< 2 channel signed 16-bit integers */ + CU_RES_VIEW_FORMAT_SINT_4X16 = 0x0c, /**< 4 channel signed 16-bit integers */ + CU_RES_VIEW_FORMAT_UINT_1X32 = 0x0d, /**< 1 channel unsigned 32-bit integers */ + CU_RES_VIEW_FORMAT_UINT_2X32 = 0x0e, /**< 2 channel unsigned 32-bit integers */ + CU_RES_VIEW_FORMAT_UINT_4X32 = 0x0f, /**< 4 channel unsigned 32-bit integers */ + CU_RES_VIEW_FORMAT_SINT_1X32 = 0x10, /**< 1 channel signed 32-bit integers */ + CU_RES_VIEW_FORMAT_SINT_2X32 = 0x11, /**< 2 channel signed 32-bit integers */ + CU_RES_VIEW_FORMAT_SINT_4X32 = 0x12, /**< 4 channel signed 32-bit integers */ + CU_RES_VIEW_FORMAT_FLOAT_1X16 = 0x13, /**< 1 channel 16-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_2X16 = 0x14, /**< 2 channel 16-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_4X16 = 0x15, /**< 4 channel 16-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_1X32 = 0x16, /**< 1 channel 32-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_2X32 = 0x17, /**< 2 channel 32-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_4X32 = 0x18, /**< 4 channel 32-bit floating point */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC1 = 0x19, /**< Block compressed 1 */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC2 = 0x1a, /**< Block compressed 2 */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC3 = 0x1b, /**< Block compressed 3 */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC4 = 0x1c, /**< Block compressed 4 unsigned */ + CU_RES_VIEW_FORMAT_SIGNED_BC4 = 0x1d, /**< Block compressed 4 signed */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC5 = 0x1e, /**< Block compressed 5 unsigned */ + CU_RES_VIEW_FORMAT_SIGNED_BC5 = 0x1f, /**< Block compressed 5 signed */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC6H = 0x20, /**< Block compressed 6 unsigned half-float */ + CU_RES_VIEW_FORMAT_SIGNED_BC6H = 0x21, /**< Block compressed 6 signed half-float */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC7 = 0x22 /**< Block compressed 7 */ +} CUresourceViewFormat; + +/** + * Resource view descriptor + */ +typedef struct CUDA_RESOURCE_VIEW_DESC_st +{ + CUresourceViewFormat format; /**< Resource view format */ + size_t width; /**< Width of the resource view */ + size_t height; /**< Height of the resource view */ + size_t depth; /**< Depth of the resource view */ + unsigned int firstMipmapLevel; /**< First defined mipmap level */ + unsigned int lastMipmapLevel; /**< Last defined mipmap level */ + unsigned int firstLayer; /**< First layer index */ + unsigned int lastLayer; /**< Last layer index */ + unsigned int reserved[16]; +} CUDA_RESOURCE_VIEW_DESC_v1; +typedef CUDA_RESOURCE_VIEW_DESC_v1 CUDA_RESOURCE_VIEW_DESC; + +/** + * Size of tensor map descriptor + */ +#define CU_TENSOR_MAP_NUM_QWORDS 16 + +/** + * Tensor map descriptor. Requires compiler support for aligning to 64 bytes. + */ +typedef struct CUtensorMap_st { +#if __cplusplus >= 201103L + alignas(64) +#elif __STDC_VERSION__ >= 201112L + _Alignas(64) +#endif + cuuint64_t opaque[CU_TENSOR_MAP_NUM_QWORDS]; +} CUtensorMap; + +/** + * Tensor map data type + */ +typedef enum CUtensorMapDataType_enum { + CU_TENSOR_MAP_DATA_TYPE_UINT8 = 0, + CU_TENSOR_MAP_DATA_TYPE_UINT16, + CU_TENSOR_MAP_DATA_TYPE_UINT32, + CU_TENSOR_MAP_DATA_TYPE_INT32, + CU_TENSOR_MAP_DATA_TYPE_UINT64, + CU_TENSOR_MAP_DATA_TYPE_INT64, + CU_TENSOR_MAP_DATA_TYPE_FLOAT16, + CU_TENSOR_MAP_DATA_TYPE_FLOAT32, + CU_TENSOR_MAP_DATA_TYPE_FLOAT64, + CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, + CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ, + CU_TENSOR_MAP_DATA_TYPE_TFLOAT32, + CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ +} CUtensorMapDataType; + +/** + * Tensor map interleave layout type + */ +typedef enum CUtensorMapInterleave_enum { + CU_TENSOR_MAP_INTERLEAVE_NONE = 0, + CU_TENSOR_MAP_INTERLEAVE_16B, + CU_TENSOR_MAP_INTERLEAVE_32B +} CUtensorMapInterleave; + +/** + * Tensor map swizzling mode of shared memory banks + */ +typedef enum CUtensorMapSwizzle_enum { + CU_TENSOR_MAP_SWIZZLE_NONE = 0, + CU_TENSOR_MAP_SWIZZLE_32B, + CU_TENSOR_MAP_SWIZZLE_64B, + CU_TENSOR_MAP_SWIZZLE_128B +} CUtensorMapSwizzle; + +/** + * Tensor map L2 promotion type + */ +typedef enum CUtensorMapL2promotion_enum { + CU_TENSOR_MAP_L2_PROMOTION_NONE = 0, + CU_TENSOR_MAP_L2_PROMOTION_L2_64B, + CU_TENSOR_MAP_L2_PROMOTION_L2_128B, + CU_TENSOR_MAP_L2_PROMOTION_L2_256B +} CUtensorMapL2promotion; + +/** + * Tensor map out-of-bounds fill type + */ +typedef enum CUtensorMapFloatOOBfill_enum { + CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE = 0, + CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA +} CUtensorMapFloatOOBfill; + +/** + * GPU Direct v3 tokens + */ +typedef struct CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_st { + unsigned long long p2pToken; + unsigned int vaSpaceToken; +} CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_v1; +typedef CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_v1 CUDA_POINTER_ATTRIBUTE_P2P_TOKENS; + +/** +* Access flags that specify the level of access the current context's device has +* on the memory referenced. +*/ +typedef enum CUDA_POINTER_ATTRIBUTE_ACCESS_FLAGS_enum { + CU_POINTER_ATTRIBUTE_ACCESS_FLAG_NONE = 0x0, /**< No access, meaning the device cannot access this memory at all, thus must be staged through accessible memory in order to complete certain operations */ + CU_POINTER_ATTRIBUTE_ACCESS_FLAG_READ = 0x1, /**< Read-only access, meaning writes to this memory are considered invalid accesses and thus return error in that case. */ + CU_POINTER_ATTRIBUTE_ACCESS_FLAG_READWRITE = 0x3 /**< Read-write access, the device has full read-write access to the memory */ +} CUDA_POINTER_ATTRIBUTE_ACCESS_FLAGS; + +/** + * Kernel launch parameters + */ +typedef struct CUDA_LAUNCH_PARAMS_st { + CUfunction function; /**< Kernel to launch */ + unsigned int gridDimX; /**< Width of grid in blocks */ + unsigned int gridDimY; /**< Height of grid in blocks */ + unsigned int gridDimZ; /**< Depth of grid in blocks */ + unsigned int blockDimX; /**< X dimension of each thread block */ + unsigned int blockDimY; /**< Y dimension of each thread block */ + unsigned int blockDimZ; /**< Z dimension of each thread block */ + unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */ + CUstream hStream; /**< Stream identifier */ + void **kernelParams; /**< Array of pointers to kernel parameters */ +} CUDA_LAUNCH_PARAMS_v1; +typedef CUDA_LAUNCH_PARAMS_v1 CUDA_LAUNCH_PARAMS; + +/** + * External memory handle types + */ +typedef enum CUexternalMemoryHandleType_enum { + /** + * Handle is an opaque file descriptor + */ + CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD = 1, + /** + * Handle is an opaque shared NT handle + */ + CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32 = 2, + /** + * Handle is an opaque, globally shared handle + */ + CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, + /** + * Handle is a D3D12 heap object + */ + CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP = 4, + /** + * Handle is a D3D12 committed resource + */ + CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE = 5, + /** + * Handle is a shared NT handle to a D3D11 resource + */ + CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE = 6, + /** + * Handle is a globally shared handle to a D3D11 resource + */ + CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT = 7, + /** + * Handle is an NvSciBuf object + */ + CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF = 8 +} CUexternalMemoryHandleType; + +/** + * Indicates that the external memory object is a dedicated resource + */ +#define CUDA_EXTERNAL_MEMORY_DEDICATED 0x1 + +/** When the \p flags parameter of ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS + * contains this flag, it indicates that signaling an external semaphore object + * should skip performing appropriate memory synchronization operations over all + * the external memory objects that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, + * which otherwise are performed by default to ensure data coherency with other + * importers of the same NvSciBuf memory objects. + */ +#define CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC 0x01 + +/** When the \p flags parameter of ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS + * contains this flag, it indicates that waiting on an external semaphore object + * should skip performing appropriate memory synchronization operations over all + * the external memory objects that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, + * which otherwise are performed by default to ensure data coherency with other + * importers of the same NvSciBuf memory objects. + */ +#define CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC 0x02 + +/** + * When \p flags of ::cuDeviceGetNvSciSyncAttributes is set to this, + * it indicates that application needs signaler specific NvSciSyncAttr + * to be filled by ::cuDeviceGetNvSciSyncAttributes. + */ +#define CUDA_NVSCISYNC_ATTR_SIGNAL 0x1 + +/** + * When \p flags of ::cuDeviceGetNvSciSyncAttributes is set to this, + * it indicates that application needs waiter specific NvSciSyncAttr + * to be filled by ::cuDeviceGetNvSciSyncAttributes. + */ +#define CUDA_NVSCISYNC_ATTR_WAIT 0x2 +/** + * External memory handle descriptor + */ +typedef struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st { + /** + * Type of the handle + */ + CUexternalMemoryHandleType type; + union { + /** + * File descriptor referencing the memory object. Valid + * when type is + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD + */ + int fd; + /** + * Win32 handle referencing the semaphore object. Valid when + * type is one of the following: + * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32 + * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT + * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP + * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE + * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE + * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT + * Exactly one of 'handle' and 'name' must be non-NULL. If + * type is one of the following: + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT + * then 'name' must be NULL. + */ + struct { + /** + * Valid NT handle. Must be NULL if 'name' is non-NULL + */ + void *handle; + /** + * Name of a valid memory object. + * Must be NULL if 'handle' is non-NULL. + */ + const void *name; + } win32; + /** + * A handle representing an NvSciBuf Object. Valid when type + * is ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF + */ + const void *nvSciBufObject; + } handle; + /** + * Size of the memory allocation + */ + unsigned long long size; + /** + * Flags must either be zero or ::CUDA_EXTERNAL_MEMORY_DEDICATED + */ + unsigned int flags; + unsigned int reserved[16]; +} CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1; +typedef CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1 CUDA_EXTERNAL_MEMORY_HANDLE_DESC; + +/** + * External memory buffer descriptor + */ +typedef struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st { + /** + * Offset into the memory object where the buffer's base is + */ + unsigned long long offset; + /** + * Size of the buffer + */ + unsigned long long size; + /** + * Flags reserved for future use. Must be zero. + */ + unsigned int flags; + unsigned int reserved[16]; +} CUDA_EXTERNAL_MEMORY_BUFFER_DESC_v1; +typedef CUDA_EXTERNAL_MEMORY_BUFFER_DESC_v1 CUDA_EXTERNAL_MEMORY_BUFFER_DESC; + +/** + * External memory mipmap descriptor + */ +typedef struct CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st { + /** + * Offset into the memory object where the base level of the + * mipmap chain is. + */ + unsigned long long offset; + /** + * Format, dimension and type of base level of the mipmap chain + */ + CUDA_ARRAY3D_DESCRIPTOR arrayDesc; + /** + * Total number of levels in the mipmap chain + */ + unsigned int numLevels; + unsigned int reserved[16]; +} CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_v1; +typedef CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_v1 CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC; + +/** + * External semaphore handle types + */ +typedef enum CUexternalSemaphoreHandleType_enum { + /** + * Handle is an opaque file descriptor + */ + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD = 1, + /** + * Handle is an opaque shared NT handle + */ + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32 = 2, + /** + * Handle is an opaque, globally shared handle + */ + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, + /** + * Handle is a shared NT handle referencing a D3D12 fence object + */ + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE = 4, + /** + * Handle is a shared NT handle referencing a D3D11 fence object + */ + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE = 5, + /** + * Opaque handle to NvSciSync Object + */ + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC = 6, + /** + * Handle is a shared NT handle referencing a D3D11 keyed mutex object + */ + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX = 7, + /** + * Handle is a globally shared handle referencing a D3D11 keyed mutex object + */ + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT = 8, + /** + * Handle is an opaque file descriptor referencing a timeline semaphore + */ + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD = 9, + /** + * Handle is an opaque shared NT handle referencing a timeline semaphore + */ + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 = 10 +} CUexternalSemaphoreHandleType; + +/** + * External semaphore handle descriptor + */ +typedef struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st { + /** + * Type of the handle + */ + CUexternalSemaphoreHandleType type; + union { + /** + * File descriptor referencing the semaphore object. Valid + * when type is one of the following: + * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD + * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD + */ + int fd; + /** + * Win32 handle referencing the semaphore object. Valid when + * type is one of the following: + * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32 + * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT + * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE + * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE + * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX + * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 + * Exactly one of 'handle' and 'name' must be non-NULL. If + * type is one of the following: + * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT + * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT + * then 'name' must be NULL. + */ + struct { + /** + * Valid NT handle. Must be NULL if 'name' is non-NULL + */ + void *handle; + /** + * Name of a valid synchronization primitive. + * Must be NULL if 'handle' is non-NULL. + */ + const void *name; + } win32; + /** + * Valid NvSciSyncObj. Must be non NULL + */ + const void* nvSciSyncObj; + } handle; + /** + * Flags reserved for the future. Must be zero. + */ + unsigned int flags; + unsigned int reserved[16]; +} CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_v1; +typedef CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_v1 CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC; + +/** + * External semaphore signal parameters + */ +typedef struct CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st { + struct { + /** + * Parameters for fence objects + */ + struct { + /** + * Value of fence to be signaled + */ + unsigned long long value; + } fence; + union { + /** + * Pointer to NvSciSyncFence. Valid if ::CUexternalSemaphoreHandleType + * is of type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC. + */ + void *fence; + unsigned long long reserved; + } nvSciSync; + /** + * Parameters for keyed mutex objects + */ + struct { + /** + * Value of key to release the mutex with + */ + unsigned long long key; + } keyedMutex; + unsigned int reserved[12]; + } params; + /** + * Only when ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS is used to + * signal a ::CUexternalSemaphore of type + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, the valid flag is + * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC which indicates + * that while signaling the ::CUexternalSemaphore, no memory synchronization + * operations should be performed for any external memory object imported + * as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. + * For all other types of ::CUexternalSemaphore, flags must be zero. + */ + unsigned int flags; + unsigned int reserved[16]; +} CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1; +typedef CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1 CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS; + +/** + * External semaphore wait parameters + */ +typedef struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st { + struct { + /** + * Parameters for fence objects + */ + struct { + /** + * Value of fence to be waited on + */ + unsigned long long value; + } fence; + /** + * Pointer to NvSciSyncFence. Valid if CUexternalSemaphoreHandleType + * is of type CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC. + */ + union { + void *fence; + unsigned long long reserved; + } nvSciSync; + /** + * Parameters for keyed mutex objects + */ + struct { + /** + * Value of key to acquire the mutex with + */ + unsigned long long key; + /** + * Timeout in milliseconds to wait to acquire the mutex + */ + unsigned int timeoutMs; + } keyedMutex; + unsigned int reserved[10]; + } params; + /** + * Only when ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS is used to wait on + * a ::CUexternalSemaphore of type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, + * the valid flag is ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC + * which indicates that while waiting for the ::CUexternalSemaphore, no memory + * synchronization operations should be performed for any external memory + * object imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. + * For all other types of ::CUexternalSemaphore, flags must be zero. + */ + unsigned int flags; + unsigned int reserved[16]; +} CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1; +typedef CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1 CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS; + +/** + * Semaphore signal node parameters + */ +typedef struct CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_st { + CUexternalSemaphore* extSemArray; /**< Array of external semaphore handles. */ + const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS* paramsArray; /**< Array of external semaphore signal parameters. */ + unsigned int numExtSems; /**< Number of handles and parameters supplied in extSemArray and paramsArray. */ +} CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1; +typedef CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 CUDA_EXT_SEM_SIGNAL_NODE_PARAMS; + +/** + * Semaphore wait node parameters + */ +typedef struct CUDA_EXT_SEM_WAIT_NODE_PARAMS_st { + CUexternalSemaphore* extSemArray; /**< Array of external semaphore handles. */ + const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS* paramsArray; /**< Array of external semaphore wait parameters. */ + unsigned int numExtSems; /**< Number of handles and parameters supplied in extSemArray and paramsArray. */ +} CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1; +typedef CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 CUDA_EXT_SEM_WAIT_NODE_PARAMS; + +typedef unsigned long long CUmemGenericAllocationHandle_v1; +typedef CUmemGenericAllocationHandle_v1 CUmemGenericAllocationHandle; + +/** + * Flags for specifying particular handle types + */ +typedef enum CUmemAllocationHandleType_enum { + CU_MEM_HANDLE_TYPE_NONE = 0x0, /**< Does not allow any export mechanism. > */ + CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR = 0x1, /**< Allows a file descriptor to be used for exporting. Permitted only on POSIX systems. (int) */ + CU_MEM_HANDLE_TYPE_WIN32 = 0x2, /**< Allows a Win32 NT handle to be used for exporting. (HANDLE) */ + CU_MEM_HANDLE_TYPE_WIN32_KMT = 0x4, /**< Allows a Win32 KMT handle to be used for exporting. (D3DKMT_HANDLE) */ + CU_MEM_HANDLE_TYPE_MAX = 0x7FFFFFFF +} CUmemAllocationHandleType; + +/** + * Specifies the memory protection flags for mapping. + */ +typedef enum CUmemAccess_flags_enum { + CU_MEM_ACCESS_FLAGS_PROT_NONE = 0x0, /**< Default, make the address range not accessible */ + CU_MEM_ACCESS_FLAGS_PROT_READ = 0x1, /**< Make the address range read accessible */ + CU_MEM_ACCESS_FLAGS_PROT_READWRITE = 0x3, /**< Make the address range read-write accessible */ + CU_MEM_ACCESS_FLAGS_PROT_MAX = 0x7FFFFFFF +} CUmemAccess_flags; + +/** + * Specifies the type of location + */ +typedef enum CUmemLocationType_enum { + CU_MEM_LOCATION_TYPE_INVALID = 0x0, + CU_MEM_LOCATION_TYPE_DEVICE = 0x1, /**< Location is a device location, thus id is a device ordinal */ + CU_MEM_LOCATION_TYPE_MAX = 0x7FFFFFFF +} CUmemLocationType; + +/** +* Defines the allocation types available +*/ +typedef enum CUmemAllocationType_enum { + CU_MEM_ALLOCATION_TYPE_INVALID = 0x0, + + /** This allocation type is 'pinned', i.e. cannot migrate from its current + * location while the application is actively using it + */ + CU_MEM_ALLOCATION_TYPE_PINNED = 0x1, + CU_MEM_ALLOCATION_TYPE_MAX = 0x7FFFFFFF +} CUmemAllocationType; + +/** +* Flag for requesting different optimal and required granularities for an allocation. +*/ +typedef enum CUmemAllocationGranularity_flags_enum { + CU_MEM_ALLOC_GRANULARITY_MINIMUM = 0x0, /**< Minimum required granularity for allocation */ + CU_MEM_ALLOC_GRANULARITY_RECOMMENDED = 0x1 /**< Recommended granularity for allocation for best performance */ +} CUmemAllocationGranularity_flags; + +/** +* Specifies the handle type for address range +*/ +typedef enum CUmemRangeHandleType_enum +{ + CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD = 0x1, + CU_MEM_RANGE_HANDLE_TYPE_MAX = 0x7FFFFFFF +} CUmemRangeHandleType; + +/** + * Sparse subresource types + */ +typedef enum CUarraySparseSubresourceType_enum { + CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL = 0, + CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL = 1 +} CUarraySparseSubresourceType; + +/** + * Memory operation types + */ +typedef enum CUmemOperationType_enum { + CU_MEM_OPERATION_TYPE_MAP = 1, + CU_MEM_OPERATION_TYPE_UNMAP = 2 +} CUmemOperationType; + +/** + * Memory handle types + */ +typedef enum CUmemHandleType_enum { + CU_MEM_HANDLE_TYPE_GENERIC = 0 +} CUmemHandleType; + +/** + * Specifies the CUDA array or CUDA mipmapped array memory mapping information + */ +typedef struct CUarrayMapInfo_st { + CUresourcetype resourceType; /**< Resource type */ + + union { + CUmipmappedArray mipmap; + CUarray array; + } resource; + + CUarraySparseSubresourceType subresourceType; /**< Sparse subresource type */ + + union { + struct { + unsigned int level; /**< For CUDA mipmapped arrays must a valid mipmap level. For CUDA arrays must be zero */ + unsigned int layer; /**< For CUDA layered arrays must be a valid layer index. Otherwise, must be zero */ + unsigned int offsetX; /**< Starting X offset in elements */ + unsigned int offsetY; /**< Starting Y offset in elements */ + unsigned int offsetZ; /**< Starting Z offset in elements */ + unsigned int extentWidth; /**< Width in elements */ + unsigned int extentHeight; /**< Height in elements */ + unsigned int extentDepth; /**< Depth in elements */ + } sparseLevel; + struct { + unsigned int layer; /**< For CUDA layered arrays must be a valid layer index. Otherwise, must be zero */ + unsigned long long offset; /**< Offset within mip tail */ + unsigned long long size; /**< Extent in bytes */ + } miptail; + } subresource; + + CUmemOperationType memOperationType; /**< Memory operation type */ + CUmemHandleType memHandleType; /**< Memory handle type */ + + union { + CUmemGenericAllocationHandle memHandle; + } memHandle; + + unsigned long long offset; /**< Offset within the memory */ + unsigned int deviceBitMask; /**< Device ordinal bit mask */ + unsigned int flags; /**< flags for future use, must be zero now. */ + unsigned int reserved[2]; /**< Reserved for future use, must be zero now. */ +} CUarrayMapInfo_v1; +typedef CUarrayMapInfo_v1 CUarrayMapInfo; + +/** + * Specifies a memory location. + */ +typedef struct CUmemLocation_st { + CUmemLocationType type; /**< Specifies the location type, which modifies the meaning of id. */ + int id; /**< identifier for a given this location's ::CUmemLocationType. */ +} CUmemLocation_v1; +typedef CUmemLocation_v1 CUmemLocation; + +/** + * Specifies compression attribute for an allocation. + */ +typedef enum CUmemAllocationCompType_enum { + CU_MEM_ALLOCATION_COMP_NONE = 0x0, /**< Allocating non-compressible memory */ + CU_MEM_ALLOCATION_COMP_GENERIC = 0x1 /**< Allocating compressible memory */ +} CUmemAllocationCompType; + +/** + * This flag if set indicates that the memory will be used as a tile pool. + */ +#define CU_MEM_CREATE_USAGE_TILE_POOL 0x1 + +/** +* Specifies the allocation properties for a allocation. +*/ +typedef struct CUmemAllocationProp_st { + /** Allocation type */ + CUmemAllocationType type; + /** requested ::CUmemAllocationHandleType */ + CUmemAllocationHandleType requestedHandleTypes; + /** Location of allocation */ + CUmemLocation location; + /** + * Windows-specific POBJECT_ATTRIBUTES required when + * ::CU_MEM_HANDLE_TYPE_WIN32 is specified. This object attributes structure + * includes security attributes that define + * the scope of which exported allocations may be transferred to other + * processes. In all other cases, this field is required to be zero. + */ + void *win32HandleMetaData; + struct { + /** + * Allocation hint for requesting compressible memory. + * On devices that support Compute Data Compression, compressible + * memory can be used to accelerate accesses to data with unstructured + * sparsity and other compressible data patterns. Applications are + * expected to query allocation property of the handle obtained with + * ::cuMemCreate using ::cuMemGetAllocationPropertiesFromHandle to + * validate if the obtained allocation is compressible or not. Note that + * compressed memory may not be mappable on all devices. + */ + unsigned char compressionType; + unsigned char gpuDirectRDMACapable; + /** Bitmask indicating intended usage for this allocation */ + unsigned short usage; + unsigned char reserved[4]; + } allocFlags; +} CUmemAllocationProp_v1; +typedef CUmemAllocationProp_v1 CUmemAllocationProp; + +/** +* Flags for querying different granularities for a multicast object +*/ +typedef enum CUmulticastGranularity_flags_enum { + CU_MULTICAST_GRANULARITY_MINIMUM = 0x0, /**< Minimum required granularity */ + CU_MULTICAST_GRANULARITY_RECOMMENDED = 0x1 /**< Recommended granularity for best performance */ +} CUmulticastGranularity_flags; + +/** +* Specifies the properties for a multicast object. +*/ +typedef struct CUmulticastObjectProp_st { + /** + * The number of devices in the multicast team that will bind memory to this + * object + */ + unsigned int numDevices; + /** + * The maximum amount of memory that can be bound to this multicast object + * per device + */ + size_t size; + /** + * Bitmask of exportable handle types (see ::CUmemAllocationHandleType) for + * this object + */ + unsigned long long handleTypes; + /** + * Flags for future use, must be zero now + */ + unsigned long long flags; +} CUmulticastObjectProp_v1; +typedef CUmulticastObjectProp_v1 CUmulticastObjectProp; + +/** + * Memory access descriptor + */ +typedef struct CUmemAccessDesc_st { + CUmemLocation location; /**< Location on which the request is to change it's accessibility */ + CUmemAccess_flags flags; /**< ::CUmemProt accessibility flags to set on the request */ +} CUmemAccessDesc_v1; +typedef CUmemAccessDesc_v1 CUmemAccessDesc; + +/** + * CUDA Graph Update error types + */ +typedef enum CUgraphExecUpdateResult_enum { + CU_GRAPH_EXEC_UPDATE_SUCCESS = 0x0, /**< The update succeeded */ + CU_GRAPH_EXEC_UPDATE_ERROR = 0x1, /**< The update failed for an unexpected reason which is described in the return value of the function */ + CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED = 0x2, /**< The update failed because the topology changed */ + CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED = 0x3, /**< The update failed because a node type changed */ + CU_GRAPH_EXEC_UPDATE_ERROR_FUNCTION_CHANGED = 0x4, /**< The update failed because the function of a kernel node changed (CUDA driver < 11.2) */ + CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED = 0x5, /**< The update failed because the parameters changed in a way that is not supported */ + CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED = 0x6, /**< The update failed because something about the node is not supported */ + CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE = 0x7, /**< The update failed because the function of a kernel node changed in an unsupported way */ + CU_GRAPH_EXEC_UPDATE_ERROR_ATTRIBUTES_CHANGED = 0x8 /**< The update failed because the node attributes changed in a way that is not supported */ +} CUgraphExecUpdateResult; + +/** + * Result information returned by cuGraphExecUpdate + */ +typedef struct CUgraphExecUpdateResultInfo_st { + /** + * Gives more specific detail when a cuda graph update fails. + */ + CUgraphExecUpdateResult result; + + /** + * The "to node" of the error edge when the topologies do not match. + * The error node when the error is associated with a specific node. + * NULL when the error is generic. + */ + CUgraphNode errorNode; + + /** + * The from node of error edge when the topologies do not match. Otherwise NULL. + */ + CUgraphNode errorFromNode; +} CUgraphExecUpdateResultInfo_v1; +typedef CUgraphExecUpdateResultInfo_v1 CUgraphExecUpdateResultInfo; + +/** + * CUDA memory pool attributes + */ +typedef enum CUmemPool_attribute_enum { + /** + * (value type = int) + * Allow cuMemAllocAsync to use memory asynchronously freed + * in another streams as long as a stream ordering dependency + * of the allocating stream on the free action exists. + * Cuda events and null stream interactions can create the required + * stream ordered dependencies. (default enabled) + */ + CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES = 1, + + /** + * (value type = int) + * Allow reuse of already completed frees when there is no dependency + * between the free and allocation. (default enabled) + */ + CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC, + + /** + * (value type = int) + * Allow cuMemAllocAsync to insert new stream dependencies + * in order to establish the stream ordering required to reuse + * a piece of memory released by cuFreeAsync (default enabled). + */ + CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES, + + /** + * (value type = cuuint64_t) + * Amount of reserved memory in bytes to hold onto before trying + * to release memory back to the OS. When more than the release + * threshold bytes of memory are held by the memory pool, the + * allocator will try to release memory back to the OS on the + * next call to stream, event or context synchronize. (default 0) + */ + CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, + + /** + * (value type = cuuint64_t) + * Amount of backing memory currently allocated for the mempool. + */ + CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT, + + /** + * (value type = cuuint64_t) + * High watermark of backing memory allocated for the mempool since the + * last time it was reset. High watermark can only be reset to zero. + */ + CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH, + + /** + * (value type = cuuint64_t) + * Amount of memory from the pool that is currently in use by the application. + */ + CU_MEMPOOL_ATTR_USED_MEM_CURRENT, + + /** + * (value type = cuuint64_t) + * High watermark of the amount of memory from the pool that was in use by the application since + * the last time it was reset. High watermark can only be reset to zero. + */ + CU_MEMPOOL_ATTR_USED_MEM_HIGH +} CUmemPool_attribute; + +/** + * Specifies the properties of allocations made from the pool. + */ +typedef struct CUmemPoolProps_st { + CUmemAllocationType allocType; /**< Allocation type. Currently must be specified as CU_MEM_ALLOCATION_TYPE_PINNED */ + CUmemAllocationHandleType handleTypes; /**< Handle types that will be supported by allocations from the pool. */ + CUmemLocation location; /**< Location where allocations should reside. */ + /** + * Windows-specific LPSECURITYATTRIBUTES required when + * ::CU_MEM_HANDLE_TYPE_WIN32 is specified. This security attribute defines + * the scope of which exported allocations may be transferred to other + * processes. In all other cases, this field is required to be zero. + */ + void *win32SecurityAttributes; + unsigned char reserved[64]; /**< reserved for future use, must be 0 */ +} CUmemPoolProps_v1; +typedef CUmemPoolProps_v1 CUmemPoolProps; + +/** + * Opaque data for exporting a pool allocation + */ +typedef struct CUmemPoolPtrExportData_st { + unsigned char reserved[64]; +} CUmemPoolPtrExportData_v1; +typedef CUmemPoolPtrExportData_v1 CUmemPoolPtrExportData; + +/** + * Memory allocation node parameters + */ +typedef struct CUDA_MEM_ALLOC_NODE_PARAMS_st { + /** + * in: location where the allocation should reside (specified in ::location). + * ::handleTypes must be ::CU_MEM_HANDLE_TYPE_NONE. IPC is not supported. + */ + CUmemPoolProps poolProps; + const CUmemAccessDesc *accessDescs; /**< in: array of memory access descriptors. Used to describe peer GPU access */ + size_t accessDescCount; /**< in: number of memory access descriptors. Must not exceed the number of GPUs. */ + size_t bytesize; /**< in: size in bytes of the requested allocation */ + CUdeviceptr dptr; /**< out: address of the allocation returned by CUDA */ +} CUDA_MEM_ALLOC_NODE_PARAMS; + +typedef enum CUgraphMem_attribute_enum { + /** + * (value type = cuuint64_t) + * Amount of memory, in bytes, currently associated with graphs + */ + CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT, + + /** + * (value type = cuuint64_t) + * High watermark of memory, in bytes, associated with graphs since the + * last time it was reset. High watermark can only be reset to zero. + */ + CU_GRAPH_MEM_ATTR_USED_MEM_HIGH, + + /** + * (value type = cuuint64_t) + * Amount of memory, in bytes, currently allocated for use by + * the CUDA graphs asynchronous allocator. + */ + CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT, + + /** + * (value type = cuuint64_t) + * High watermark of memory, in bytes, currently allocated for use by + * the CUDA graphs asynchronous allocator. + */ + CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH +} CUgraphMem_attribute; + +/** + * If set, each kernel launched as part of ::cuLaunchCooperativeKernelMultiDevice only + * waits for prior work in the stream corresponding to that GPU to complete before the + * kernel begins execution. + */ +#define CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC 0x01 + +/** + * If set, any subsequent work pushed in a stream that participated in a call to + * ::cuLaunchCooperativeKernelMultiDevice will only wait for the kernel launched on + * the GPU corresponding to that stream to complete before it begins execution. + */ +#define CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC 0x02 + +/** + * If set, the CUDA array is a collection of layers, where each layer is either a 1D + * or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number + * of layers, not the depth of a 3D array. + */ +#define CUDA_ARRAY3D_LAYERED 0x01 + +/** + * Deprecated, use CUDA_ARRAY3D_LAYERED + */ +#define CUDA_ARRAY3D_2DARRAY 0x01 + +/** + * This flag must be set in order to bind a surface reference + * to the CUDA array + */ +#define CUDA_ARRAY3D_SURFACE_LDST 0x02 + +/** + * If set, the CUDA array is a collection of six 2D arrays, representing faces of a cube. The + * width of such a CUDA array must be equal to its height, and Depth must be six. + * If ::CUDA_ARRAY3D_LAYERED flag is also set, then the CUDA array is a collection of cubemaps + * and Depth must be a multiple of six. + */ +#define CUDA_ARRAY3D_CUBEMAP 0x04 + +/** + * This flag must be set in order to perform texture gather operations + * on a CUDA array. + */ +#define CUDA_ARRAY3D_TEXTURE_GATHER 0x08 + +/** + * This flag if set indicates that the CUDA + * array is a DEPTH_TEXTURE. + */ +#define CUDA_ARRAY3D_DEPTH_TEXTURE 0x10 + +/** + * This flag indicates that the CUDA array may be bound as a color target + * in an external graphics API + */ +#define CUDA_ARRAY3D_COLOR_ATTACHMENT 0x20 + +/** + * This flag if set indicates that the CUDA array or CUDA mipmapped array + * is a sparse CUDA array or CUDA mipmapped array respectively + */ +#define CUDA_ARRAY3D_SPARSE 0x40 + +/** + * This flag if set indicates that the CUDA array or CUDA mipmapped array + * will allow deferred memory mapping + */ +#define CUDA_ARRAY3D_DEFERRED_MAPPING 0x80 + +/** + * Override the texref format with a format inferred from the array. + * Flag for ::cuTexRefSetArray() + */ +#define CU_TRSA_OVERRIDE_FORMAT 0x01 + +/** + * Read the texture as integers rather than promoting the values to floats + * in the range [0,1]. + * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate() + */ +#define CU_TRSF_READ_AS_INTEGER 0x01 + +/** + * Use normalized texture coordinates in the range [0,1) instead of [0,dim). + * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate() + */ +#define CU_TRSF_NORMALIZED_COORDINATES 0x02 + +/** + * Perform sRGB->linear conversion during texture read. + * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate() + */ +#define CU_TRSF_SRGB 0x10 + + /** + * Disable any trilinear filtering optimizations. + * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate() + */ +#define CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION 0x20 + +/** + * Enable seamless cube map filtering. + * Flag for ::cuTexObjectCreate() + */ +#define CU_TRSF_SEAMLESS_CUBEMAP 0x40 + +/** + * C++ compile time constant for CU_LAUNCH_PARAM_END + */ +#define CU_LAUNCH_PARAM_END_AS_INT 0x00 + +/** + * End of array terminator for the \p extra parameter to + * ::cuLaunchKernel + */ +#define CU_LAUNCH_PARAM_END ((void*)CU_LAUNCH_PARAM_END_AS_INT) + +/** + * C++ compile time constant for CU_LAUNCH_PARAM_BUFFER_POINTER + */ +#define CU_LAUNCH_PARAM_BUFFER_POINTER_AS_INT 0x01 + +/** + * Indicator that the next value in the \p extra parameter to + * ::cuLaunchKernel will be a pointer to a buffer containing all kernel + * parameters used for launching kernel \p f. This buffer needs to + * honor all alignment/padding requirements of the individual parameters. + * If ::CU_LAUNCH_PARAM_BUFFER_SIZE is not also specified in the + * \p extra array, then ::CU_LAUNCH_PARAM_BUFFER_POINTER will have no + * effect. + */ +#define CU_LAUNCH_PARAM_BUFFER_POINTER ((void*)CU_LAUNCH_PARAM_BUFFER_POINTER_AS_INT) + +/** + * C++ compile time constant for CU_LAUNCH_PARAM_BUFFER_SIZE + */ +#define CU_LAUNCH_PARAM_BUFFER_SIZE_AS_INT 0x02 + +/** + * Indicator that the next value in the \p extra parameter to + * ::cuLaunchKernel will be a pointer to a size_t which contains the + * size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER. + * It is required that ::CU_LAUNCH_PARAM_BUFFER_POINTER also be specified + * in the \p extra array if the value associated with + * ::CU_LAUNCH_PARAM_BUFFER_SIZE is not zero. + */ +#define CU_LAUNCH_PARAM_BUFFER_SIZE ((void*)CU_LAUNCH_PARAM_BUFFER_SIZE_AS_INT) + +/** + * For texture references loaded into the module, use default texunit from + * texture reference. + */ +#define CU_PARAM_TR_DEFAULT -1 + +/** + * Device that represents the CPU + */ +#define CU_DEVICE_CPU ((CUdevice)-1) + +/** + * Device that represents an invalid device + */ +#define CU_DEVICE_INVALID ((CUdevice)-2) + +/** + * Bitmasks for ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS + */ +typedef enum CUflushGPUDirectRDMAWritesOptions_enum { + CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST = 1<<0, /**< ::cuFlushGPUDirectRDMAWrites() and its CUDA Runtime API counterpart are supported on the device. */ + CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_MEMOPS = 1<<1 /**< The ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. */ +} CUflushGPUDirectRDMAWritesOptions; + +/** + * Platform native ordering for GPUDirect RDMA writes + */ +typedef enum CUGPUDirectRDMAWritesOrdering_enum { + CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE = 0, /**< The device does not natively support ordering of remote writes. ::cuFlushGPUDirectRDMAWrites() can be leveraged if supported. */ + CU_GPU_DIRECT_RDMA_WRITES_ORDERING_OWNER = 100, /**< Natively, the device can consistently consume remote writes, although other CUDA devices may not. */ + CU_GPU_DIRECT_RDMA_WRITES_ORDERING_ALL_DEVICES = 200 /**< Any CUDA device in the system can consistently consume remote writes to this device. */ +} CUGPUDirectRDMAWritesOrdering; + +/** + * The scopes for ::cuFlushGPUDirectRDMAWrites + */ +typedef enum CUflushGPUDirectRDMAWritesScope_enum { + CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_OWNER = 100, /**< Blocks until remote writes are visible to the CUDA device context owning the data. */ + CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_ALL_DEVICES = 200 /**< Blocks until remote writes are visible to all CUDA device contexts. */ +} CUflushGPUDirectRDMAWritesScope; + +/** + * The targets for ::cuFlushGPUDirectRDMAWrites + */ +typedef enum CUflushGPUDirectRDMAWritesTarget_enum { + CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX = 0 /**< Sets the target for ::cuFlushGPUDirectRDMAWrites() to the currently active CUDA device context. */ +} CUflushGPUDirectRDMAWritesTarget; + +/** + * The additional write options for ::cuGraphDebugDotPrint + */ +typedef enum CUgraphDebugDot_flags_enum { + CU_GRAPH_DEBUG_DOT_FLAGS_VERBOSE = 1<<0, /**< Output all debug data as if every debug flag is enabled */ + CU_GRAPH_DEBUG_DOT_FLAGS_RUNTIME_TYPES = 1<<1, /**< Use CUDA Runtime structures for output */ + CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_PARAMS = 1<<2, /**< Adds CUDA_KERNEL_NODE_PARAMS values to output */ + CU_GRAPH_DEBUG_DOT_FLAGS_MEMCPY_NODE_PARAMS = 1<<3, /**< Adds CUDA_MEMCPY3D values to output */ + CU_GRAPH_DEBUG_DOT_FLAGS_MEMSET_NODE_PARAMS = 1<<4, /**< Adds CUDA_MEMSET_NODE_PARAMS values to output */ + CU_GRAPH_DEBUG_DOT_FLAGS_HOST_NODE_PARAMS = 1<<5, /**< Adds CUDA_HOST_NODE_PARAMS values to output */ + CU_GRAPH_DEBUG_DOT_FLAGS_EVENT_NODE_PARAMS = 1<<6, /**< Adds CUevent handle from record and wait nodes to output */ + CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_SIGNAL_NODE_PARAMS = 1<<7, /**< Adds CUDA_EXT_SEM_SIGNAL_NODE_PARAMS values to output */ + CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_WAIT_NODE_PARAMS = 1<<8, /**< Adds CUDA_EXT_SEM_WAIT_NODE_PARAMS values to output */ + CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_ATTRIBUTES = 1<<9, /**< Adds CUkernelNodeAttrValue values to output */ + CU_GRAPH_DEBUG_DOT_FLAGS_HANDLES = 1<<10, /**< Adds node handles and every kernel function handle to output */ + CU_GRAPH_DEBUG_DOT_FLAGS_MEM_ALLOC_NODE_PARAMS = 1<<11, /**< Adds memory alloc node parameters to output */ + CU_GRAPH_DEBUG_DOT_FLAGS_MEM_FREE_NODE_PARAMS = 1<<12, /**< Adds memory free node parameters to output */ + CU_GRAPH_DEBUG_DOT_FLAGS_BATCH_MEM_OP_NODE_PARAMS = 1<<13 /**< Adds batch mem op node parameters to output */ + , CU_GRAPH_DEBUG_DOT_FLAGS_EXTRA_TOPO_INFO = 1<<14 /**< Adds edge numbering information */ +} CUgraphDebugDot_flags; + +/** + * Flags for user objects for graphs + */ +typedef enum CUuserObject_flags_enum { + CU_USER_OBJECT_NO_DESTRUCTOR_SYNC = 1 /**< Indicates the destructor execution is not synchronized by any CUDA handle. */ +} CUuserObject_flags; + +/** + * Flags for retaining user object references for graphs + */ +typedef enum CUuserObjectRetain_flags_enum { + CU_GRAPH_USER_OBJECT_MOVE = 1 /**< Transfer references from the caller rather than creating new references. */ +} CUuserObjectRetain_flags; + +/** + * Flags for instantiating a graph + */ +typedef enum CUgraphInstantiate_flags_enum { + CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH = 1 /**< Automatically free memory allocated in a graph before relaunching. */ + , CUDA_GRAPH_INSTANTIATE_FLAG_UPLOAD = 2 /**< Automatically upload the graph after instantiaton. */ + , CUDA_GRAPH_INSTANTIATE_FLAG_DEVICE_LAUNCH = 4 /**< Instantiate the graph to be launchable from the device. */ + , CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY = 8 /**< Run the graph using the per-node priority attributes rather than the + priority of the stream it is launched into. */ +} CUgraphInstantiate_flags; + +typedef enum cuDeviceIsNumaNode_enum { + CU_DEVICE_NOT_NUMA_NODE = 0x01, + CU_DEVICE_IS_NUMA_NODE = 0x02, +} cuDeviceIsNumaNode; + +/** @} */ /* END CUDA_TYPES */ + +#if defined(__GNUC__) + #if defined(__CUDA_API_PUSH_VISIBILITY_DEFAULT) + #pragma GCC visibility push(default) + #endif +#endif + +#ifdef _WIN32 +#define CUDAAPI __stdcall +#else +#define CUDAAPI +#endif + +/** + * \defgroup CUDA_ERROR Error Handling + * + * ___MANBRIEF___ error handling functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the error handling functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Gets the string description of an error code + * + * Sets \p *pStr to the address of a NULL-terminated string description + * of the error code \p error. + * If the error code is not recognized, ::CUDA_ERROR_INVALID_VALUE + * will be returned and \p *pStr will be set to the NULL address. + * + * \param error - Error code to convert to string + * \param pStr - Address of the string pointer. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::CUresult, + * ::cudaGetErrorString + */ +CUresult CUDAAPI cuGetErrorString(CUresult error, const char **pStr); + +/** + * \brief Gets the string representation of an error code enum name + * + * Sets \p *pStr to the address of a NULL-terminated string representation + * of the name of the enum error code \p error. + * If the error code is not recognized, ::CUDA_ERROR_INVALID_VALUE + * will be returned and \p *pStr will be set to the NULL address. + * + * \param error - Error code to convert to string + * \param pStr - Address of the string pointer. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::CUresult, + * ::cudaGetErrorName + */ +CUresult CUDAAPI cuGetErrorName(CUresult error, const char **pStr); + +/** @} */ /* END CUDA_ERROR */ + +/** + * \defgroup CUDA_INITIALIZE Initialization + * + * ___MANBRIEF___ initialization functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the initialization functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Initialize the CUDA driver API + * Initializes the driver API and must be called before any other function from + * the driver API in the current process. Currently, the \p Flags parameter must be 0. If ::cuInit() + * has not been called, any function from the driver API will return + * ::CUDA_ERROR_NOT_INITIALIZED. + * + * \param Flags - Initialization flag for CUDA. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_SYSTEM_DRIVER_MISMATCH, + * ::CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE + * \notefnerr + */ +CUresult CUDAAPI cuInit(unsigned int Flags); + +/** @} */ /* END CUDA_INITIALIZE */ + +/** + * \defgroup CUDA_VERSION Version Management + * + * ___MANBRIEF___ version management functions of the low-level CUDA driver + * API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the version management functions of the low-level + * CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Returns the latest CUDA version supported by driver + * + * Returns in \p *driverVersion the version of CUDA supported by + * the driver. The version is returned as + * (1000 × major + 10 × minor). For example, CUDA 9.2 + * would be represented by 9020. + * + * This function automatically returns ::CUDA_ERROR_INVALID_VALUE if + * \p driverVersion is NULL. + * + * \param driverVersion - Returns the CUDA driver version + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa + * ::cudaDriverGetVersion, + * ::cudaRuntimeGetVersion + */ +CUresult CUDAAPI cuDriverGetVersion(int *driverVersion); + +/** @} */ /* END CUDA_VERSION */ + +/** + * \defgroup CUDA_DEVICE Device Management + * + * ___MANBRIEF___ device management functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the device management functions of the low-level + * CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Returns a handle to a compute device + * + * Returns in \p *device a device handle given an ordinal in the range [0, + * ::cuDeviceGetCount()-1]. + * + * \param device - Returned device handle + * \param ordinal - Device number to get handle for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetAttribute, + * ::cuDeviceGetCount, + * ::cuDeviceGetName, + * ::cuDeviceGetUuid, + * ::cuDeviceGetLuid, + * ::cuDeviceTotalMem, + * ::cuDeviceGetExecAffinitySupport + */ +CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal); + +/** + * \brief Returns the number of compute-capable devices + * + * Returns in \p *count the number of devices with compute capability greater + * than or equal to 2.0 that are available for execution. If there is no such + * device, ::cuDeviceGetCount() returns 0. + * + * \param count - Returned number of compute-capable devices + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa + * ::cuDeviceGetAttribute, + * ::cuDeviceGetName, + * ::cuDeviceGetUuid, + * ::cuDeviceGetLuid, + * ::cuDeviceGet, + * ::cuDeviceTotalMem, + * ::cuDeviceGetExecAffinitySupport, + * ::cudaGetDeviceCount + */ +CUresult CUDAAPI cuDeviceGetCount(int *count); + +/** + * \brief Returns an identifier string for the device + * + * Returns an ASCII string identifying the device \p dev in the NULL-terminated + * string pointed to by \p name. \p len specifies the maximum length of the + * string that may be returned. + * + * \param name - Returned identifier string for the device + * \param len - Maximum length of string to store in \p name + * \param dev - Device to get identifier string for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetAttribute, + * ::cuDeviceGetUuid, + * ::cuDeviceGetLuid, + * ::cuDeviceGetCount, + * ::cuDeviceGet, + * ::cuDeviceTotalMem, + * ::cuDeviceGetExecAffinitySupport, + * ::cudaGetDeviceProperties + */ +CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev); + +/** + * \brief Return an UUID for the device + * + * Note there is a later version of this API, ::cuDeviceGetUuid_v2. It will + * supplant this version in 12.0, which is retained for minor version compatibility. + * + * Returns 16-octets identifying the device \p dev in the structure + * pointed by the \p uuid. + * + * \param uuid - Returned UUID + * \param dev - Device to get identifier string for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetUuid_v2 + * ::cuDeviceGetAttribute, + * ::cuDeviceGetCount, + * ::cuDeviceGetName, + * ::cuDeviceGetLuid, + * ::cuDeviceGet, + * ::cuDeviceTotalMem, + * ::cuDeviceGetExecAffinitySupport, + * ::cudaGetDeviceProperties + */ +CUresult CUDAAPI cuDeviceGetUuid(CUuuid *uuid, CUdevice dev); + +/** + * \brief Return an UUID for the device (11.4+) + * + * Returns 16-octets identifying the device \p dev in the structure + * pointed by the \p uuid. If the device is in MIG mode, returns its + * MIG UUID which uniquely identifies the subscribed MIG compute instance. + * + * \param uuid - Returned UUID + * \param dev - Device to get identifier string for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetAttribute, + * ::cuDeviceGetCount, + * ::cuDeviceGetName, + * ::cuDeviceGetLuid, + * ::cuDeviceGet, + * ::cuDeviceTotalMem, + * ::cudaGetDeviceProperties + */ +CUresult CUDAAPI cuDeviceGetUuid_v2(CUuuid *uuid, CUdevice dev); + +/** + * \brief Return an LUID and device node mask for the device + * + * Return identifying information (\p luid and \p deviceNodeMask) to allow + * matching device with graphics APIs. + * + * \param luid - Returned LUID + * \param deviceNodeMask - Returned device node mask + * \param dev - Device to get identifier string for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetAttribute, + * ::cuDeviceGetCount, + * ::cuDeviceGetName, + * ::cuDeviceGet, + * ::cuDeviceTotalMem, + * ::cuDeviceGetExecAffinitySupport, + * ::cudaGetDeviceProperties + */ +CUresult CUDAAPI cuDeviceGetLuid(char *luid, unsigned int *deviceNodeMask, CUdevice dev); + +/** + * \brief Returns the total amount of memory on the device + * + * Returns in \p *bytes the total amount of memory available on the device + * \p dev in bytes. + * + * \param bytes - Returned memory available on device in bytes + * \param dev - Device handle + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetAttribute, + * ::cuDeviceGetCount, + * ::cuDeviceGetName, + * ::cuDeviceGetUuid, + * ::cuDeviceGet, + * ::cuDeviceGetExecAffinitySupport, + * ::cudaMemGetInfo + */ +CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev); + +/** + * \brief Returns the maximum number of elements allocatable in a 1D linear texture for a given texture element size. + * + * Returns in \p maxWidthInElements the maximum number of texture elements allocatable in a 1D linear texture + * for given \p format and \p numChannels. + * + * \param maxWidthInElements - Returned maximum number of texture elements allocatable for given \p format and \p numChannels. + * \param format - Texture format. + * \param numChannels - Number of channels per texture element. + * \param dev - Device handle. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetAttribute, + * ::cuDeviceGetCount, + * ::cuDeviceGetName, + * ::cuDeviceGetUuid, + * ::cuDeviceGet, + * ::cudaMemGetInfo, + * ::cuDeviceTotalMem + */ +CUresult CUDAAPI cuDeviceGetTexture1DLinearMaxWidth(size_t *maxWidthInElements, CUarray_format format, unsigned numChannels, CUdevice dev); + +/** + * \brief Returns information about the device + * + * Returns in \p *pi the integer value of the attribute \p attrib on device + * \p dev. The supported attributes are: + * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK: Maximum number of threads per + * block; + * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X: Maximum x-dimension of a block + * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y: Maximum y-dimension of a block + * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z: Maximum z-dimension of a block + * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X: Maximum x-dimension of a grid + * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y: Maximum y-dimension of a grid + * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z: Maximum z-dimension of a grid + * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK: Maximum amount of + * shared memory available to a thread block in bytes + * - ::CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY: Memory available on device for + * __constant__ variables in a CUDA C kernel in bytes + * - ::CU_DEVICE_ATTRIBUTE_WARP_SIZE: Warp size in threads + * - ::CU_DEVICE_ATTRIBUTE_MAX_PITCH: Maximum pitch in bytes allowed by the + * memory copy functions that involve memory regions allocated through + * ::cuMemAllocPitch() + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH: Maximum 1D + * texture width + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH: Maximum width + * for a 1D texture bound to linear memory + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH: Maximum + * mipmapped 1D texture width + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH: Maximum 2D + * texture width + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT: Maximum 2D + * texture height + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH: Maximum width + * for a 2D texture bound to linear memory + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT: Maximum height + * for a 2D texture bound to linear memory + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH: Maximum pitch + * in bytes for a 2D texture bound to linear memory + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH: Maximum + * mipmapped 2D texture width + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT: Maximum + * mipmapped 2D texture height + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH: Maximum 3D + * texture width + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT: Maximum 3D + * texture height + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH: Maximum 3D + * texture depth + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE: + * Alternate maximum 3D texture width, 0 if no alternate + * maximum 3D texture size is supported + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE: + * Alternate maximum 3D texture height, 0 if no alternate + * maximum 3D texture size is supported + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE: + * Alternate maximum 3D texture depth, 0 if no alternate + * maximum 3D texture size is supported + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH: + * Maximum cubemap texture width or height + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH: + * Maximum 1D layered texture width + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS: + * Maximum layers in a 1D layered texture + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH: + * Maximum 2D layered texture width + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT: + * Maximum 2D layered texture height + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS: + * Maximum layers in a 2D layered texture + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH: + * Maximum cubemap layered texture width or height + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS: + * Maximum layers in a cubemap layered texture + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH: + * Maximum 1D surface width + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH: + * Maximum 2D surface width + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT: + * Maximum 2D surface height + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH: + * Maximum 3D surface width + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT: + * Maximum 3D surface height + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH: + * Maximum 3D surface depth + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH: + * Maximum 1D layered surface width + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS: + * Maximum layers in a 1D layered surface + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH: + * Maximum 2D layered surface width + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT: + * Maximum 2D layered surface height + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS: + * Maximum layers in a 2D layered surface + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH: + * Maximum cubemap surface width + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH: + * Maximum cubemap layered surface width + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS: + * Maximum layers in a cubemap layered surface + * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK: Maximum number of 32-bit + * registers available to a thread block + * - ::CU_DEVICE_ATTRIBUTE_CLOCK_RATE: The typical clock frequency in kilohertz + * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT: Alignment requirement; texture + * base addresses aligned to ::textureAlign bytes do not need an offset + * applied to texture fetches + * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT: Pitch alignment requirement + * for 2D texture references bound to pitched memory + * - ::CU_DEVICE_ATTRIBUTE_GPU_OVERLAP: 1 if the device can concurrently copy + * memory between host and device while executing a kernel, or 0 if not + * - ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT: Number of multiprocessors on + * the device + * - ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT: 1 if there is a run time limit + * for kernels executed on the device, or 0 if not + * - ::CU_DEVICE_ATTRIBUTE_INTEGRATED: 1 if the device is integrated with the + * memory subsystem, or 0 if not + * - ::CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY: 1 if the device can map host + * memory into the CUDA address space, or 0 if not + * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE: Compute mode that device is currently + * in. Available modes are as follows: + * - ::CU_COMPUTEMODE_DEFAULT: Default mode - Device is not restricted and + * can have multiple CUDA contexts present at a single time. + * - ::CU_COMPUTEMODE_PROHIBITED: Compute-prohibited mode - Device is + * prohibited from creating new CUDA contexts. + * - ::CU_COMPUTEMODE_EXCLUSIVE_PROCESS: Compute-exclusive-process mode - Device + * can have only one context used by a single process at a time. + * - ::CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS: 1 if the device supports + * executing multiple kernels within the same context simultaneously, or 0 if + * not. It is not guaranteed that multiple kernels will be resident + * on the device concurrently so this feature should not be relied upon for + * correctness. + * - ::CU_DEVICE_ATTRIBUTE_ECC_ENABLED: 1 if error correction is enabled on the + * device, 0 if error correction is disabled or not supported by the device + * - ::CU_DEVICE_ATTRIBUTE_PCI_BUS_ID: PCI bus identifier of the device + * - ::CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID: PCI device (also known as slot) identifier + * of the device + * - ::CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID: PCI domain identifier of the device + * - ::CU_DEVICE_ATTRIBUTE_TCC_DRIVER: 1 if the device is using a TCC driver. TCC + * is only available on Tesla hardware running Windows Vista or later + * - ::CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE: Peak memory clock frequency in kilohertz + * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH: Global memory bus width in bits + * - ::CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE: Size of L2 cache in bytes. 0 if the device doesn't have L2 cache + * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR: Maximum resident threads per multiprocessor + * - ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING: 1 if the device shares a unified address space with + * the host, or 0 if not + * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR: Major compute capability version number + * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR: Minor compute capability version number + * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED: 1 if device supports caching globals + * in L1 cache, 0 if caching globals in L1 cache is not supported by the device + * - ::CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED: 1 if device supports caching locals + * in L1 cache, 0 if caching locals in L1 cache is not supported by the device + * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR: Maximum amount of + * shared memory available to a multiprocessor in bytes; this amount is shared + * by all thread blocks simultaneously resident on a multiprocessor + * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR: Maximum number of 32-bit + * registers available to a multiprocessor; this number is shared by all thread + * blocks simultaneously resident on a multiprocessor + * - ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY: 1 if device supports allocating managed memory + * on this system, 0 if allocating managed memory is not supported by the device on this system. + * - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD: 1 if device is on a multi-GPU board, 0 if not. + * - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID: Unique identifier for a group of devices + * associated with the same board. Devices on the same multi-GPU board will share the same identifier. + * - ::CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED: 1 if Link between the device and the host + * supports native atomic operations. + * - ::CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO: Ratio of single precision performance + * (in floating-point operations per second) to double precision performance. + * - ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS: Device supports coherently accessing + * pageable memory without calling cudaHostRegister on it. + * - ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS: Device can coherently access managed memory + * concurrently with the CPU. + * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED: Device supports Compute Preemption. + * - ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM: Device can access host registered + * memory at the same virtual address as the CPU. + * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN: The maximum per block shared memory size + * supported on this device. This is the maximum value that can be opted into when using the cuFuncSetAttribute() or cuKernelSetAttribute() call. + * For more details see ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES + * - ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES: Device accesses pageable memory via the host's + * page tables. + * - ::CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST: The host can directly access managed memory on the device without migration. + * - ::CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED: Device supports virtual memory management APIs like ::cuMemAddressReserve, ::cuMemCreate, ::cuMemMap and related APIs + * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED: Device supports exporting memory to a posix file descriptor with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate + * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED: Device supports exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate + * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED: Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate + * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR: Maximum number of thread blocks that can reside on a multiprocessor + * - ::CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED: Device supports compressible memory allocation via ::cuMemCreate + * - ::CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE: Maximum L2 persisting lines capacity setting in bytes + * - ::CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE: Maximum value of CUaccessPolicyWindow::num_bytes + * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED: Device supports specifying the GPUDirect RDMA flag with ::cuMemCreate. + * - ::CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK: Amount of shared memory per block reserved by CUDA driver in bytes + * - ::CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED: Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays. + * - ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED: Device supports using the ::cuMemHostRegister flag ::CU_MEMHOSTERGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU + * - ::CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED: Device supports using the ::cuMemAllocAsync and ::cuMemPool family of APIs + * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED: Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information) + * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS: The returned attribute shall be interpreted as a bitmask, where the individual bits are described by the ::CUflushGPUDirectRDMAWritesOptions enum + * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING: GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::CUGPUDirectRDMAWritesOrdering for the numerical values returned here. + * - ::CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES: Bitmask of handle types supported with mempool based IPC + * - ::CU_DEVICE_ATTRIBUTE_DEFERRED_MAPPING_CUDA_ARRAY_SUPPORTED: Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays. + * + * \param pi - Returned device attribute value + * \param attrib - Device attribute to query + * \param dev - Device handle + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetCount, + * ::cuDeviceGetName, + * ::cuDeviceGetUuid, + * ::cuDeviceGet, + * ::cuDeviceTotalMem, + * ::cuDeviceGetExecAffinitySupport, + * ::cudaDeviceGetAttribute, + * ::cudaGetDeviceProperties + */ +CUresult CUDAAPI cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev); + +/** + * \brief Return NvSciSync attributes that this device can support. + * + * Returns in \p nvSciSyncAttrList, the properties of NvSciSync that + * this CUDA device, \p dev can support. The returned \p nvSciSyncAttrList + * can be used to create an NvSciSync object that matches this device's capabilities. + * + * If NvSciSyncAttrKey_RequiredPerm field in \p nvSciSyncAttrList is + * already set this API will return ::CUDA_ERROR_INVALID_VALUE. + * + * The applications should set \p nvSciSyncAttrList to a valid + * NvSciSyncAttrList failing which this API will return + * ::CUDA_ERROR_INVALID_HANDLE. + * + * The \p flags controls how applications intends to use + * the NvSciSync created from the \p nvSciSyncAttrList. The valid flags are: + * - ::CUDA_NVSCISYNC_ATTR_SIGNAL, specifies that the applications intends to + * signal an NvSciSync on this CUDA device. + * - ::CUDA_NVSCISYNC_ATTR_WAIT, specifies that the applications intends to + * wait on an NvSciSync on this CUDA device. + * + * At least one of these flags must be set, failing which the API + * returns ::CUDA_ERROR_INVALID_VALUE. Both the flags are orthogonal + * to one another: a developer may set both these flags that allows to + * set both wait and signal specific attributes in the same \p nvSciSyncAttrList. + * + * Note that this API updates the input \p nvSciSyncAttrList with values equivalent + * to the following public attribute key-values: + * NvSciSyncAttrKey_RequiredPerm is set to + * - NvSciSyncAccessPerm_SignalOnly if ::CUDA_NVSCISYNC_ATTR_SIGNAL is set in \p flags. + * - NvSciSyncAccessPerm_WaitOnly if ::CUDA_NVSCISYNC_ATTR_WAIT is set in \p flags. + * - NvSciSyncAccessPerm_WaitSignal if both ::CUDA_NVSCISYNC_ATTR_WAIT and + * ::CUDA_NVSCISYNC_ATTR_SIGNAL are set in \p flags. + * NvSciSyncAttrKey_PrimitiveInfo is set to + * - NvSciSyncAttrValPrimitiveType_SysmemSemaphore on any valid \p device. + * - NvSciSyncAttrValPrimitiveType_Syncpoint if \p device is a Tegra device. + * - NvSciSyncAttrValPrimitiveType_SysmemSemaphorePayload64b if \p device is GA10X+. + * NvSciSyncAttrKey_GpuId is set to the same UUID that is returned for this + * \p device from ::cuDeviceGetUuid. + * + * \param nvSciSyncAttrList - Return NvSciSync attributes supported. + * \param dev - Valid Cuda Device to get NvSciSync attributes for. + * \param flags - flags describing NvSciSync usage. + * + * \return + * + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_OUT_OF_MEMORY + * + * \sa + * ::cuImportExternalSemaphore, + * ::cuDestroyExternalSemaphore, + * ::cuSignalExternalSemaphoresAsync, + * ::cuWaitExternalSemaphoresAsync + */ +CUresult CUDAAPI cuDeviceGetNvSciSyncAttributes(void *nvSciSyncAttrList, CUdevice dev, int flags); + +/** + * \brief Sets the current memory pool of a device + * + * The memory pool must be local to the specified device. + * ::cuMemAllocAsync allocates from the current mempool of the provided stream's device. + * By default, a device's current memory pool is its default memory pool. + * + * \note Use ::cuMemAllocFromPoolAsync to specify asynchronous allocations from a device different + * than the one the stream runs on. + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate, ::cuMemPoolDestroy, ::cuMemAllocFromPoolAsync + */ +CUresult CUDAAPI cuDeviceSetMemPool(CUdevice dev, CUmemoryPool pool); + +/** + * \brief Gets the current mempool for a device + * + * Returns the last pool provided to ::cuDeviceSetMemPool for this device + * or the device's default memory pool if ::cuDeviceSetMemPool has never been called. + * By default the current mempool is the default mempool for a device. + * Otherwise the returned pool must have been set with ::cuDeviceSetMemPool. + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuDeviceGetDefaultMemPool, ::cuMemPoolCreate, ::cuDeviceSetMemPool + */ +CUresult CUDAAPI cuDeviceGetMemPool(CUmemoryPool *pool, CUdevice dev); + +/** + * \brief Returns the default mempool of a device + * + * The default mempool of a device contains device memory from that device. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \notefnerr + * + * \sa ::cuMemAllocAsync, ::cuMemPoolTrimTo, ::cuMemPoolGetAttribute, ::cuMemPoolSetAttribute, cuMemPoolSetAccess, ::cuDeviceGetMemPool, ::cuMemPoolCreate + */ +CUresult CUDAAPI cuDeviceGetDefaultMemPool(CUmemoryPool *pool_out, CUdevice dev); + +/** + * \brief Returns information about the execution affinity support of the device. + * + * Returns in \p *pi whether execution affinity type \p type is supported by device \p dev. + * The supported types are: + * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT: 1 if context with limited SMs is supported by the device, + * or 0 if not; + * + * \param pi - 1 if the execution affinity type \p type is supported by the device, or 0 if not + * \param type - Execution affinity type to query + * \param dev - Device handle + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetAttribute, + * ::cuDeviceGetCount, + * ::cuDeviceGetName, + * ::cuDeviceGetUuid, + * ::cuDeviceGet, + * ::cuDeviceTotalMem + */ +CUresult CUDAAPI cuDeviceGetExecAffinitySupport(int *pi, CUexecAffinityType type, CUdevice dev); + +/** + * \brief Blocks until remote writes are visible to the specified scope + * + * Blocks until GPUDirect RDMA writes to the target context via mappings + * created through APIs like nvidia_p2p_get_pages (see + * https://docs.nvidia.com/cuda/gpudirect-rdma for more information), are + * visible to the specified scope. + * + * If the scope equals or lies within the scope indicated by + * ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING, the call + * will be a no-op and can be safely omitted for performance. This can be + * determined by comparing the numerical values between the two enums, with + * smaller scopes having smaller values. + * + * Users may query support for this API via + * ::CU_DEVICE_ATTRIBUTE_FLUSH_FLUSH_GPU_DIRECT_RDMA_OPTIONS. + * + * \param target - The target of the operation, see ::CUflushGPUDirectRDMAWritesTarget + * \param scope - The scope of the operation, see ::CUflushGPUDirectRDMAWritesScope + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * \notefnerr + * + */ +CUresult CUDAAPI cuFlushGPUDirectRDMAWrites(CUflushGPUDirectRDMAWritesTarget target, CUflushGPUDirectRDMAWritesScope scope); + +/** @} */ /* END CUDA_DEVICE */ + +/** + * \defgroup CUDA_DEVICE_DEPRECATED Device Management [DEPRECATED] + * + * ___MANBRIEF___ deprecated device management functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the device management functions of the low-level + * CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Returns properties for a selected device + * + * \deprecated + * + * This function was deprecated as of CUDA 5.0 and replaced by ::cuDeviceGetAttribute(). + * + * Returns in \p *prop the properties of device \p dev. The ::CUdevprop + * structure is defined as: + * + * \code + typedef struct CUdevprop_st { + int maxThreadsPerBlock; + int maxThreadsDim[3]; + int maxGridSize[3]; + int sharedMemPerBlock; + int totalConstantMemory; + int SIMDWidth; + int memPitch; + int regsPerBlock; + int clockRate; + int textureAlign + } CUdevprop; + * \endcode + * where: + * + * - ::maxThreadsPerBlock is the maximum number of threads per block; + * - ::maxThreadsDim[3] is the maximum sizes of each dimension of a block; + * - ::maxGridSize[3] is the maximum sizes of each dimension of a grid; + * - ::sharedMemPerBlock is the total amount of shared memory available per + * block in bytes; + * - ::totalConstantMemory is the total amount of constant memory available on + * the device in bytes; + * - ::SIMDWidth is the warp size; + * - ::memPitch is the maximum pitch allowed by the memory copy functions that + * involve memory regions allocated through ::cuMemAllocPitch(); + * - ::regsPerBlock is the total number of registers available per block; + * - ::clockRate is the clock frequency in kilohertz; + * - ::textureAlign is the alignment requirement; texture base addresses that + * are aligned to ::textureAlign bytes do not need an offset applied to + * texture fetches. + * + * \param prop - Returned properties of device + * \param dev - Device to get properties for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetAttribute, + * ::cuDeviceGetCount, + * ::cuDeviceGetName, + * ::cuDeviceGetUuid, + * ::cuDeviceGet, + * ::cuDeviceTotalMem + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceGetProperties(CUdevprop *prop, CUdevice dev); + +/** + * \brief Returns the compute capability of the device + * + * \deprecated + * + * This function was deprecated as of CUDA 5.0 and its functionality superseded + * by ::cuDeviceGetAttribute(). + * + * Returns in \p *major and \p *minor the major and minor revision numbers that + * define the compute capability of the device \p dev. + * + * \param major - Major revision number + * \param minor - Minor revision number + * \param dev - Device handle + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetAttribute, + * ::cuDeviceGetCount, + * ::cuDeviceGetName, + * ::cuDeviceGetUuid, + * ::cuDeviceGet, + * ::cuDeviceTotalMem + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev); + +/** @} */ /* END CUDA_DEVICE_DEPRECATED */ + +/** + * \defgroup CUDA_PRIMARY_CTX Primary Context Management + * + * ___MANBRIEF___ primary context management functions of the low-level CUDA driver + * API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the primary context management functions of the low-level + * CUDA driver application programming interface. + * + * The primary context is unique per device and shared with the CUDA runtime API. + * These functions allow integration with other libraries using CUDA. + * + * @{ + */ + +/** + * \brief Retain the primary context on the GPU + * + * Retains the primary context on the device. + * Once the user successfully retains the primary context, the primary context + * will be active and available to the user until the user releases it + * with ::cuDevicePrimaryCtxRelease() or resets it with ::cuDevicePrimaryCtxReset(). + * Unlike ::cuCtxCreate() the newly retained context is not pushed onto the stack. + * + * Retaining the primary context for the first time will fail with ::CUDA_ERROR_UNKNOWN + * if the compute mode of the device is ::CU_COMPUTEMODE_PROHIBITED. The function + * ::cuDeviceGetAttribute() can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to + * determine the compute mode of the device. + * The nvidia-smi tool can be used to set the compute mode for + * devices. Documentation for nvidia-smi can be obtained by passing a + * -h option to it. + * + * Please note that the primary context always supports pinned allocations. Other + * flags can be specified by ::cuDevicePrimaryCtxSetFlags(). + * + * \param pctx - Returned context handle of the new context + * \param dev - Device for which primary context is requested + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa ::cuDevicePrimaryCtxRelease, + * ::cuDevicePrimaryCtxSetFlags, + * ::cuCtxCreate, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuDevicePrimaryCtxRetain(CUcontext *pctx, CUdevice dev); + +/** + * \brief Release the primary context on the GPU + * + * Releases the primary context interop on the device. + * A retained context should always be released once the user is done using + * it. The context is automatically reset once the last reference to it is + * released. This behavior is different when the primary context was retained + * by the CUDA runtime from CUDA 4.0 and earlier. In this case, the primary + * context remains always active. + * + * Releasing a primary context that has not been previously retained will + * fail with ::CUDA_ERROR_INVALID_CONTEXT. + * + * Please note that unlike ::cuCtxDestroy() this method does not pop the context + * from stack in any circumstances. + * + * \param dev - Device which primary context is released + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa ::cuDevicePrimaryCtxRetain, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev); + +/** + * \brief Set flags for the primary context + * + * Sets the flags for the primary context on the device overwriting perviously + * set ones. + * + * The three LSBs of the \p flags parameter can be used to control how the OS + * thread, which owns the CUDA context at the time of an API call, interacts + * with the OS scheduler when waiting for results from the GPU. Only one of + * the scheduling flags can be set when creating a context. + * + * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for + * results from the GPU. This can decrease latency when waiting for the GPU, + * but may lower the performance of CPU threads if they are performing work in + * parallel with the CUDA thread. + * + * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for + * results from the GPU. This can increase latency when waiting for the GPU, + * but can increase the performance of CPU threads performing work in parallel + * with the GPU. + * + * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a + * synchronization primitive when waiting for the GPU to finish work. + * + * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a + * synchronization primitive when waiting for the GPU to finish work.
+ * Deprecated: This flag was deprecated as of CUDA 4.0 and was + * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. + * + * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero, + * uses a heuristic based on the number of active CUDA contexts in the + * process \e C and the number of logical processors in the system \e P. If + * \e C > \e P, then CUDA will yield to other OS threads when waiting for + * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while + * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN). + * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on + * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC + * for low-powered devices. + * + * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory + * after resizing local memory for a kernel. This can prevent thrashing by + * local memory allocations when launching many kernels with high local + * memory usage at the cost of potentially increased memory usage.
+ * Deprecated: This flag is deprecated and the behavior enabled + * by this flag is now the default and cannot be disabled. + * + * - ::CU_CTX_COREDUMP_ENABLE: If GPU coredumps have not been enabled globally + * with ::cuCoredumpSetAttributeGlobal or environment variables, this flag can + * be set during context creation to instruct CUDA to create a coredump if + * this context raises an exception during execution. These environment variables + * are described in the CUDA-GDB user guide under the "GPU core dump support" + * section. + * The initial settings will be taken from the global settings at the time of + * context creation. The other settings that control coredump output can be + * modified by calling ::cuCoredumpSetAttribute from the created context after + * it becomes current. + * + * - ::CU_CTX_USER_COREDUMP_ENABLE: If user-triggered GPU coredumps have not + * been enabled globally with ::cuCoredumpSetAttributeGlobal or environment + * variables, this flag can be set during context creation to instruct CUDA to + * create a coredump if data is written to a certain pipe that is present in the + * OS space. These environment variables are described in the CUDA-GDB user + * guide under the "GPU core dump support" section. + * It is important to note that the pipe name *must* be set with + * ::cuCoredumpSetAttributeGlobal before creating the context if this flag is + * used. Setting this flag implies that ::CU_CTX_COREDUMP_ENABLE is set. + * The initial settings will be taken from the global settings at the time of + * context creation. The other settings that control coredump output can be + * modified by calling ::cuCoredumpSetAttribute from the created context after + * it becomes current. + * + * \param dev - Device for which the primary context flags are set + * \param flags - New flags for the device + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_INVALID_VALUE, + * \notefnerr + * + * \sa ::cuDevicePrimaryCtxRetain, + * ::cuDevicePrimaryCtxGetState, + * ::cuCtxCreate, + * ::cuCtxGetFlags, + * ::cuCtxSetFlags, + * ::cudaSetDeviceFlags + */ +CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags); + +/** + * \brief Get the state of the primary context + * + * Returns in \p *flags the flags for the primary context of \p dev, and in + * \p *active whether it is active. See ::cuDevicePrimaryCtxSetFlags for flag + * values. + * + * \param dev - Device to get primary context flags for + * \param flags - Pointer to store flags + * \param active - Pointer to store context state; 0 = inactive, 1 = active + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_INVALID_VALUE, + * \notefnerr + * + * \sa + * ::cuDevicePrimaryCtxSetFlags, + * ::cuCtxGetFlags, + * ::cuCtxSetFlags, + * ::cudaGetDeviceFlags + */ +CUresult CUDAAPI cuDevicePrimaryCtxGetState(CUdevice dev, unsigned int *flags, int *active); + +/** + * \brief Destroy all allocations and reset all state on the primary context + * + * Explicitly destroys and cleans up all resources associated with the current + * device in the current process. + * + * Note that it is responsibility of the calling function to ensure that no + * other module in the process is using the device any more. For that reason + * it is recommended to use ::cuDevicePrimaryCtxRelease() in most cases. + * However it is safe for other modules to call ::cuDevicePrimaryCtxRelease() + * even after resetting the device. + * Resetting the primary context does not release it, an application that has + * retained the primary context should explicitly release its usage. + * + * \param dev - Device for which primary context is destroyed + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE + * \notefnerr + * + * \sa ::cuDevicePrimaryCtxRetain, + * ::cuDevicePrimaryCtxRelease, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::cudaDeviceReset + */ +CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev); + +/** @} */ /* END CUDA_PRIMARY_CTX */ + +/** + * \defgroup CUDA_CTX Context Management + * + * ___MANBRIEF___ context management functions of the low-level CUDA driver + * API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the context management functions of the low-level + * CUDA driver application programming interface. + * + * Please note that some functions are described in + * \ref CUDA_PRIMARY_CTX "Primary Context Management" section. + * + * @{ + */ + +/** + * \brief Create a CUDA context + * + * \note In most cases it is recommended to use ::cuDevicePrimaryCtxRetain. + * + * Creates a new CUDA context and associates it with the calling thread. The + * \p flags parameter is described below. The context is created with a usage + * count of 1 and the caller of ::cuCtxCreate() must call ::cuCtxDestroy() + * when done using the context. If a context is already current to the thread, + * it is supplanted by the newly created context and may be restored by a subsequent + * call to ::cuCtxPopCurrent(). + * + * The three LSBs of the \p flags parameter can be used to control how the OS + * thread, which owns the CUDA context at the time of an API call, interacts + * with the OS scheduler when waiting for results from the GPU. Only one of + * the scheduling flags can be set when creating a context. + * + * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for + * results from the GPU. This can decrease latency when waiting for the GPU, + * but may lower the performance of CPU threads if they are performing work in + * parallel with the CUDA thread. + * + * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for + * results from the GPU. This can increase latency when waiting for the GPU, + * but can increase the performance of CPU threads performing work in parallel + * with the GPU. + * + * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a + * synchronization primitive when waiting for the GPU to finish work. + * + * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a + * synchronization primitive when waiting for the GPU to finish work.
+ * Deprecated: This flag was deprecated as of CUDA 4.0 and was + * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. + * + * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero, + * uses a heuristic based on the number of active CUDA contexts in the + * process \e C and the number of logical processors in the system \e P. If + * \e C > \e P, then CUDA will yield to other OS threads when waiting for + * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while + * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN). + * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on + * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC + * for low-powered devices. + * + * - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations. + * This flag must be set in order to allocate pinned host memory that is + * accessible to the GPU. + * + * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory + * after resizing local memory for a kernel. This can prevent thrashing by + * local memory allocations when launching many kernels with high local + * memory usage at the cost of potentially increased memory usage.
+ * Deprecated: This flag is deprecated and the behavior enabled + * by this flag is now the default and cannot be disabled. + * Instead, the per-thread stack size can be controlled with ::cuCtxSetLimit(). + * + * - ::CU_CTX_COREDUMP_ENABLE: If GPU coredumps have not been enabled globally + * with ::cuCoredumpSetAttributeGlobal or environment variables, this flag can + * be set during context creation to instruct CUDA to create a coredump if + * this context raises an exception during execution. These environment variables + * are described in the CUDA-GDB user guide under the "GPU core dump support" + * section. + * The initial attributes will be taken from the global attributes at the time of + * context creation. The other attributes that control coredump output can be + * modified by calling ::cuCoredumpSetAttribute from the created context after + * it becomes current. + * + * - ::CU_CTX_USER_COREDUMP_ENABLE: If user-triggered GPU coredumps have not + * been enabled globally with ::cuCoredumpSetAttributeGlobal or environment + * variables, this flag can be set during context creation to instruct CUDA to + * create a coredump if data is written to a certain pipe that is present in the + * OS space. These environment variables are described in the CUDA-GDB user + * guide under the "GPU core dump support" section. + * It is important to note that the pipe name *must* be set with + * ::cuCoredumpSetAttributeGlobal before creating the context if this flag is + * used. Setting this flag implies that ::CU_CTX_COREDUMP_ENABLE is set. + * The initial attributes will be taken from the global attributes at the time of + * context creation. The other attributes that control coredump output can be + * modified by calling ::cuCoredumpSetAttribute from the created context after + * it becomes current. + * Setting this flag on any context creation is equivalent to setting the + * ::CU_COREDUMP_ENABLE_USER_TRIGGER attribute to \p true globally. + * + * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of + * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute() + * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the + * compute mode of the device. The nvidia-smi tool can be used to set + * the compute mode for * devices. + * Documentation for nvidia-smi can be obtained by passing a + * -h option to it. + * + * \param pctx - Returned context handle of the new context + * \param flags - Context creation flags + * \param dev - Device to create context on + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCoredumpSetAttributeGlobal, + * ::cuCoredumpSetAttribute, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev); + +/** + * \brief Create a CUDA context with execution affinity + * + * Creates a new CUDA context with execution affinity and associates it with + * the calling thread. The \p paramsArray and \p flags parameter are described below. + * The context is created with a usage count of 1 and the caller of ::cuCtxCreate() must + * call ::cuCtxDestroy() when done using the context. If a context is already + * current to the thread, it is supplanted by the newly created context and may + * be restored by a subsequent call to ::cuCtxPopCurrent(). + * + * The type and the amount of execution resource the context can use is limited by \p paramsArray + * and \p numParams. The \p paramsArray is an array of \p CUexecAffinityParam and the \p numParams + * describes the size of the array. If two \p CUexecAffinityParam in the array have the same type, + * the latter execution affinity parameter overrides the former execution affinity parameter. + * The supported execution affinity types are: + * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT limits the portion of SMs that the context can use. The portion + * of SMs is specified as the number of SMs via \p CUexecAffinitySmCount. This limit will be internally + * rounded up to the next hardware-supported amount. Hence, it is imperative to query the actual execution + * affinity of the context via \p cuCtxGetExecAffinity after context creation. Currently, this attribute + * is only supported under Volta+ MPS. + * + * The three LSBs of the \p flags parameter can be used to control how the OS + * thread, which owns the CUDA context at the time of an API call, interacts + * with the OS scheduler when waiting for results from the GPU. Only one of + * the scheduling flags can be set when creating a context. + * + * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for + * results from the GPU. This can decrease latency when waiting for the GPU, + * but may lower the performance of CPU threads if they are performing work in + * parallel with the CUDA thread. + * + * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for + * results from the GPU. This can increase latency when waiting for the GPU, + * but can increase the performance of CPU threads performing work in parallel + * with the GPU. + * + * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a + * synchronization primitive when waiting for the GPU to finish work. + * + * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a + * synchronization primitive when waiting for the GPU to finish work.
+ * Deprecated: This flag was deprecated as of CUDA 4.0 and was + * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. + * + * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero, + * uses a heuristic based on the number of active CUDA contexts in the + * process \e C and the number of logical processors in the system \e P. If + * \e C > \e P, then CUDA will yield to other OS threads when waiting for + * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while + * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN). + * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on + * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC + * for low-powered devices. + * + * - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations. + * This flag must be set in order to allocate pinned host memory that is + * accessible to the GPU. + * + * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory + * after resizing local memory for a kernel. This can prevent thrashing by + * local memory allocations when launching many kernels with high local + * memory usage at the cost of potentially increased memory usage.
+ * Deprecated: This flag is deprecated and the behavior enabled + * by this flag is now the default and cannot be disabled. + * Instead, the per-thread stack size can be controlled with ::cuCtxSetLimit(). + * + * - ::CU_CTX_COREDUMP_ENABLE: If GPU coredumps have not been enabled globally + * with ::cuCoredumpSetAttributeGlobal or environment variables, this flag can + * be set during context creation to instruct CUDA to create a coredump if + * this context raises an exception during execution. These environment variables + * are described in the CUDA-GDB user guide under the "GPU core dump support" + * section. + * The initial attributes will be taken from the global attributes at the time of + * context creation. The other attributes that control coredump output can be + * modified by calling ::cuCoredumpSetAttribute from the created context after + * it becomes current. + * + * - ::CU_CTX_USER_COREDUMP_ENABLE: If user-triggered GPU coredumps have not + * been enabled globally with ::cuCoredumpSetAttributeGlobal or environment + * variables, this flag can be set during context creation to instruct CUDA to + * create a coredump if data is written to a certain pipe that is present in the + * OS space. These environment variables are described in the CUDA-GDB user + * guide under the "GPU core dump support" section. + * It is important to note that the pipe name *must* be set with + * ::cuCoredumpSetAttributeGlobal before creating the context if this flag is + * used. Setting this flag implies that ::CU_CTX_COREDUMP_ENABLE is set. + * The initial attributes will be taken from the global attributes at the time of + * context creation. The other attributes that control coredump output can be + * modified by calling ::cuCoredumpSetAttribute from the created context after + * it becomes current. + * Setting this flag on any context creation is equivalent to setting the + * ::CU_COREDUMP_ENABLE_USER_TRIGGER attribute to \p true globally. + * + * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of + * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute() + * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the + * compute mode of the device. The nvidia-smi tool can be used to set + * the compute mode for * devices. + * Documentation for nvidia-smi can be obtained by passing a + * -h option to it. + * + * \param pctx - Returned context handle of the new context + * \param paramsArray - Execution affinity parameters + * \param numParams - Number of execution affinity parameters + * \param flags - Context creation flags + * \param dev - Device to create context on + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::cuCoredumpSetAttributeGlobal, + * ::cuCoredumpSetAttribute, + * ::CUexecAffinityParam + */ +CUresult CUDAAPI cuCtxCreate_v3(CUcontext *pctx, CUexecAffinityParam *paramsArray, int numParams, unsigned int flags, CUdevice dev); + +/** + * \brief Destroy a CUDA context + * + * Destroys the CUDA context specified by \p ctx. The context \p ctx will be + * destroyed regardless of how many threads it is current to. + * It is the responsibility of the calling function to ensure that no API + * call issues using \p ctx while ::cuCtxDestroy() is executing. + * + * Destroys and cleans up all resources associated with the context. + * It is the caller's responsibility to ensure that the context or its resources + * are not accessed or passed in subsequent API calls and doing so will result in undefined behavior. + * These resources include CUDA types such as ::CUmodule, ::CUfunction, ::CUstream, ::CUevent, + * ::CUarray, ::CUmipmappedArray, ::CUtexObject, ::CUsurfObject, ::CUtexref, ::CUsurfref, + * ::CUgraphicsResource, ::CUlinkState, ::CUexternalMemory and ::CUexternalSemaphore. + * + * If \p ctx is current to the calling thread then \p ctx will also be + * popped from the current thread's context stack (as though ::cuCtxPopCurrent() + * were called). If \p ctx is current to other threads, then \p ctx will + * remain current to those threads, and attempting to access \p ctx from + * those threads will result in the error ::CUDA_ERROR_CONTEXT_IS_DESTROYED. + * + * \param ctx - Context to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuCtxDestroy(CUcontext ctx); + +/** + * \brief Pushes a context on the current CPU thread + * + * Pushes the given context \p ctx onto the CPU thread's stack of current + * contexts. The specified context becomes the CPU thread's current context, so + * all CUDA functions that operate on the current context are affected. + * + * The previous current context may be made current again by calling + * ::cuCtxDestroy() or ::cuCtxPopCurrent(). + * + * \param ctx - Context to push + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx); + +/** + * \brief Pops the current CUDA context from the current CPU thread. + * + * Pops the current CUDA context from the CPU thread and passes back the + * old context handle in \p *pctx. That context may then be made current + * to a different CPU thread by calling ::cuCtxPushCurrent(). + * + * If a context was current to the CPU thread before ::cuCtxCreate() or + * ::cuCtxPushCurrent() was called, this function makes that context current to + * the CPU thread again. + * + * \param pctx - Returned popped context handle + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx); + +/** + * \brief Binds the specified CUDA context to the calling CPU thread + * + * Binds the specified CUDA context to the calling CPU thread. + * If \p ctx is NULL then the CUDA context previously bound to the + * calling CPU thread is unbound and ::CUDA_SUCCESS is returned. + * + * If there exists a CUDA context stack on the calling CPU thread, this + * will replace the top of that stack with \p ctx. + * If \p ctx is NULL then this will be equivalent to popping the top + * of the calling CPU thread's CUDA context stack (or a no-op if the + * calling CPU thread's CUDA context stack is empty). + * + * \param ctx - Context to bind to the calling CPU thread + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa + * ::cuCtxGetCurrent, + * ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cudaSetDevice + */ +CUresult CUDAAPI cuCtxSetCurrent(CUcontext ctx); + +/** + * \brief Returns the CUDA context bound to the calling CPU thread. + * + * Returns in \p *pctx the CUDA context bound to the calling CPU thread. + * If no context is bound to the calling CPU thread then \p *pctx is + * set to NULL and ::CUDA_SUCCESS is returned. + * + * \param pctx - Returned context handle + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * \notefnerr + * + * \sa + * ::cuCtxSetCurrent, + * ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cudaGetDevice + */ +CUresult CUDAAPI cuCtxGetCurrent(CUcontext *pctx); + +/** + * \brief Returns the device ID for the current context + * + * Returns in \p *device the ordinal of the current context's device. + * + * \param device - Returned device ID for the current context + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::cudaGetDevice + */ +CUresult CUDAAPI cuCtxGetDevice(CUdevice *device); + +/** + * \brief Returns the flags for the current context + * + * Returns in \p *flags the flags of the current context. See ::cuCtxCreate + * for flag values. + * + * \param flags - Pointer to store flags of current context + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetCurrent, + * ::cuCtxGetDevice, + * ::cuCtxGetLimit, + * ::cuCtxGetSharedMemConfig, + * ::cuCtxGetStreamPriorityRange, + * ::cuCtxSetFlags, + * ::cudaGetDeviceFlags + */ +CUresult CUDAAPI cuCtxGetFlags(unsigned int *flags); + +/** + * \brief Sets the flags for the current context + * + * \param flags - Flags to set on the current context + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetCurrent, + * ::cuCtxGetDevice, + * ::cuCtxGetLimit, + * ::cuCtxGetSharedMemConfig, + * ::cuCtxGetStreamPriorityRange, + * ::cuCtxGetFlags, + * ::cudaGetDeviceFlags, + * ::cuDevicePrimaryCtxSetFlags, + */ +CUresult CUDAAPI cuCtxSetFlags(unsigned int flags); + +/** + * \brief Returns the unique Id associated with the context supplied + * + * Returns in \p ctxId the unique Id which is associated with a given context. + * The Id is unique for the life of the program for this instance of CUDA. + * If context is supplied as NULL and there is one current, the Id of the + * current context is returned. + * + * \param ctx - Context for which to obtain the Id + * \param ctxId - Pointer to store the Id of the context + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_CONTEXT_IS_DESTROYED, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPushCurrent + */ +CUresult CUDAAPI cuCtxGetId(CUcontext ctx, unsigned long long *ctxId); + +/** + * \brief Block for a context's tasks to complete + * + * Blocks until the device has completed all preceding requested tasks. + * ::cuCtxSynchronize() returns an error if one of the preceding tasks failed. + * If the context was created with the ::CU_CTX_SCHED_BLOCKING_SYNC flag, the + * CPU thread will block until the GPU context has finished its work. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cudaDeviceSynchronize + */ +CUresult CUDAAPI cuCtxSynchronize(void); + +/** + * \brief Set resource limits + * + * Setting \p limit to \p value is a request by the application to update + * the current limit maintained by the context. The driver is free to + * modify the requested value to meet h/w requirements (this could be + * clamping to minimum or maximum values, rounding up to nearest element + * size, etc). The application can use ::cuCtxGetLimit() to find out exactly + * what the limit has been set to. + * + * Setting each ::CUlimit has its own specific restrictions, so each is + * discussed here. + * + * - ::CU_LIMIT_STACK_SIZE controls the stack size in bytes of each GPU thread. + * The driver automatically increases the per-thread stack size + * for each kernel launch as needed. This size isn't reset back to the + * original value after each launch. Setting this value will take effect + * immediately, and if necessary, the device will block until all preceding + * requested tasks are complete. + * + * - ::CU_LIMIT_PRINTF_FIFO_SIZE controls the size in bytes of the FIFO used + * by the ::printf() device system call. Setting ::CU_LIMIT_PRINTF_FIFO_SIZE + * must be performed before launching any kernel that uses the ::printf() + * device system call, otherwise ::CUDA_ERROR_INVALID_VALUE will be returned. + * + * - ::CU_LIMIT_MALLOC_HEAP_SIZE controls the size in bytes of the heap used + * by the ::malloc() and ::free() device system calls. Setting + * ::CU_LIMIT_MALLOC_HEAP_SIZE must be performed before launching any kernel + * that uses the ::malloc() or ::free() device system calls, otherwise + * ::CUDA_ERROR_INVALID_VALUE will be returned. + * + * - ::CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH controls the maximum nesting depth of + * a grid at which a thread can safely call ::cudaDeviceSynchronize(). Setting + * this limit must be performed before any launch of a kernel that uses the + * device runtime and calls ::cudaDeviceSynchronize() above the default sync + * depth, two levels of grids. Calls to ::cudaDeviceSynchronize() will fail + * with error code ::cudaErrorSyncDepthExceeded if the limitation is + * violated. This limit can be set smaller than the default or up the maximum + * launch depth of 24. When setting this limit, keep in mind that additional + * levels of sync depth require the driver to reserve large amounts of device + * memory which can no longer be used for user allocations. If these + * reservations of device memory fail, ::cuCtxSetLimit() will return + * ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value. + * This limit is only applicable to devices of compute capability < 9.0. + * Attempting to set this limit on devices of other compute capability + * versions will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT being + * returned. + * + * - ::CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT controls the maximum number of + * outstanding device runtime launches that can be made from the current + * context. A grid is outstanding from the point of launch up until the grid + * is known to have been completed. Device runtime launches which violate + * this limitation fail and return ::cudaErrorLaunchPendingCountExceeded when + * ::cudaGetLastError() is called after launch. If more pending launches than + * the default (2048 launches) are needed for a module using the device + * runtime, this limit can be increased. Keep in mind that being able to + * sustain additional pending launches will require the driver to reserve + * larger amounts of device memory upfront which can no longer be used for + * allocations. If these reservations fail, ::cuCtxSetLimit() will return + * ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value. + * This limit is only applicable to devices of compute capability 3.5 and + * higher. Attempting to set this limit on devices of compute capability less + * than 3.5 will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT being + * returned. + * + * - ::CU_LIMIT_MAX_L2_FETCH_GRANULARITY controls the L2 cache fetch granularity. + * Values can range from 0B to 128B. This is purely a performance hint and + * it can be ignored or clamped depending on the platform. + * + * - ::CU_LIMIT_PERSISTING_L2_CACHE_SIZE controls size in bytes available for + * persisting L2 cache. This is purely a performance hint and it can be + * ignored or clamped depending on the platform. + * + * \param limit - Limit to set + * \param value - Size of limit + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNSUPPORTED_LIMIT, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSynchronize, + * ::cudaDeviceSetLimit + */ +CUresult CUDAAPI cuCtxSetLimit(CUlimit limit, size_t value); + +/** + * \brief Returns resource limits + * + * Returns in \p *pvalue the current size of \p limit. The supported + * ::CUlimit values are: + * - ::CU_LIMIT_STACK_SIZE: stack size in bytes of each GPU thread. + * - ::CU_LIMIT_PRINTF_FIFO_SIZE: size in bytes of the FIFO used by the + * ::printf() device system call. + * - ::CU_LIMIT_MALLOC_HEAP_SIZE: size in bytes of the heap used by the + * ::malloc() and ::free() device system calls. + * - ::CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH: maximum grid depth at which a thread + * can issue the device runtime call ::cudaDeviceSynchronize() to wait on + * child grid launches to complete. + * - ::CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT: maximum number of outstanding + * device runtime launches that can be made from this context. + * - ::CU_LIMIT_MAX_L2_FETCH_GRANULARITY: L2 cache fetch granularity. + * - ::CU_LIMIT_PERSISTING_L2_CACHE_SIZE: Persisting L2 cache size in bytes + * + * \param limit - Limit to query + * \param pvalue - Returned size of limit + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNSUPPORTED_LIMIT + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::cudaDeviceGetLimit + */ +CUresult CUDAAPI cuCtxGetLimit(size_t *pvalue, CUlimit limit); + +/** + * \brief Returns the preferred cache configuration for the current context. + * + * On devices where the L1 cache and shared memory use the same hardware + * resources, this function returns through \p pconfig the preferred cache configuration + * for the current context. This is only a preference. The driver will use + * the requested configuration if possible, but it is free to choose a different + * configuration if required to execute functions. + * + * This will return a \p pconfig of ::CU_FUNC_CACHE_PREFER_NONE on devices + * where the size of the L1 cache and shared memory are fixed. + * + * The supported cache configurations are: + * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default) + * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache + * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory + * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory + * + * \param pconfig - Returned cache configuration + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::cuFuncSetCacheConfig, + * ::cudaDeviceGetCacheConfig + */ +CUresult CUDAAPI cuCtxGetCacheConfig(CUfunc_cache *pconfig); + +/** + * \brief Sets the preferred cache configuration for the current context. + * + * On devices where the L1 cache and shared memory use the same hardware + * resources, this sets through \p config the preferred cache configuration for + * the current context. This is only a preference. The driver will use + * the requested configuration if possible, but it is free to choose a different + * configuration if required to execute the function. Any function preference + * set via ::cuFuncSetCacheConfig() or ::cuKernelSetCacheConfig() will be preferred over this context-wide + * setting. Setting the context-wide cache configuration to + * ::CU_FUNC_CACHE_PREFER_NONE will cause subsequent kernel launches to prefer + * to not change the cache configuration unless required to launch the kernel. + * + * This setting does nothing on devices where the size of the L1 cache and + * shared memory are fixed. + * + * Launching a kernel with a different preference than the most recent + * preference setting may insert a device-side synchronization point. + * + * The supported cache configurations are: + * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default) + * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache + * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory + * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory + * + * \param config - Requested cache configuration + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::cuFuncSetCacheConfig, + * ::cudaDeviceSetCacheConfig, + * ::cuKernelSetCacheConfig + */ +CUresult CUDAAPI cuCtxSetCacheConfig(CUfunc_cache config); + +/** + * \brief Returns the current shared memory configuration for the current context. + * + * This function will return in \p pConfig the current size of shared memory banks + * in the current context. On devices with configurable shared memory banks, + * ::cuCtxSetSharedMemConfig can be used to change this setting, so that all + * subsequent kernel launches will by default use the new bank size. When + * ::cuCtxGetSharedMemConfig is called on devices without configurable shared + * memory, it will return the fixed bank size of the hardware. + * + * The returned bank configurations can be either: + * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: shared memory bank width is + * four bytes. + * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: shared memory bank width will + * eight bytes. + * + * \param pConfig - returned shared memory configuration + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::cuCtxGetSharedMemConfig, + * ::cuFuncSetCacheConfig, + * ::cudaDeviceGetSharedMemConfig + */ +CUresult CUDAAPI cuCtxGetSharedMemConfig(CUsharedconfig *pConfig); + +/** + * \brief Sets the shared memory configuration for the current context. + * + * On devices with configurable shared memory banks, this function will set + * the context's shared memory bank size which is used for subsequent kernel + * launches. + * + * Changed the shared memory configuration between launches may insert a device + * side synchronization point between those launches. + * + * Changing the shared memory bank size will not increase shared memory usage + * or affect occupancy of kernels, but may have major effects on performance. + * Larger bank sizes will allow for greater potential bandwidth to shared memory, + * but will change what kinds of accesses to shared memory will result in bank + * conflicts. + * + * This function will do nothing on devices with fixed shared memory bank size. + * + * The supported bank configurations are: + * - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: set bank width to the default initial + * setting (currently, four bytes). + * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to + * be natively four bytes. + * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: set shared memory bank width to + * be natively eight bytes. + * + * \param config - requested shared memory configuration + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::cuCtxGetSharedMemConfig, + * ::cuFuncSetCacheConfig, + * ::cudaDeviceSetSharedMemConfig + */ +CUresult CUDAAPI cuCtxSetSharedMemConfig(CUsharedconfig config); + +/** + * \brief Gets the context's API version. + * + * Returns a version number in \p version corresponding to the capabilities of + * the context (e.g. 3010 or 3020), which library developers can use to direct + * callers to a specific API version. If \p ctx is NULL, returns the API version + * used to create the currently bound context. + * + * Note that new API versions are only introduced when context capabilities are + * changed that break binary compatibility, so the API version and driver version + * may be different. For example, it is valid for the API version to be 3020 while + * the driver version is 4020. + * + * \param ctx - Context to check + * \param version - Pointer to version + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuCtxGetApiVersion(CUcontext ctx, unsigned int *version); + +/** + * \brief Returns numerical values that correspond to the least and + * greatest stream priorities. + * + * Returns in \p *leastPriority and \p *greatestPriority the numerical values that correspond + * to the least and greatest stream priorities respectively. Stream priorities + * follow a convention where lower numbers imply greater priorities. The range of + * meaningful stream priorities is given by [\p *greatestPriority, \p *leastPriority]. + * If the user attempts to create a stream with a priority value that is + * outside the meaningful range as specified by this API, the priority is + * automatically clamped down or up to either \p *leastPriority or \p *greatestPriority + * respectively. See ::cuStreamCreateWithPriority for details on creating a + * priority stream. + * A NULL may be passed in for \p *leastPriority or \p *greatestPriority if the value + * is not desired. + * + * This function will return '0' in both \p *leastPriority and \p *greatestPriority if + * the current context's device does not support stream priorities + * (see ::cuDeviceGetAttribute). + * + * \param leastPriority - Pointer to an int in which the numerical value for least + * stream priority is returned + * \param greatestPriority - Pointer to an int in which the numerical value for greatest + * stream priority is returned + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \notefnerr + * + * \sa ::cuStreamCreateWithPriority, + * ::cuStreamGetPriority, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::cudaDeviceGetStreamPriorityRange + */ +CUresult CUDAAPI cuCtxGetStreamPriorityRange(int *leastPriority, int *greatestPriority); + +/** + * \brief Resets all persisting lines in cache to normal status. + * + * ::cuCtxResetPersistingL2Cache Resets all persisting lines in cache to normal + * status. Takes effect on function return. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_SUPPORTED + * \notefnerr + * + * \sa + * ::CUaccessPolicyWindow + */ +CUresult CUDAAPI cuCtxResetPersistingL2Cache(void); + +/** + * \brief Returns the execution affinity setting for the current context. + * + * Returns in \p *pExecAffinity the current value of \p type. The supported + * ::CUexecAffinityType values are: + * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT: number of SMs the context is limited to use. + * + * \param type - Execution affinity type to query + * \param pExecAffinity - Returned execution affinity + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY + * \notefnerr + * + * \sa + * ::CUexecAffinityParam + */ +CUresult CUDAAPI cuCtxGetExecAffinity(CUexecAffinityParam *pExecAffinity, CUexecAffinityType type); + + +/** @} */ /* END CUDA_CTX */ + +/** + * \defgroup CUDA_CTX_DEPRECATED Context Management [DEPRECATED] + * + * ___MANBRIEF___ deprecated context management functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the deprecated context management functions of the low-level + * CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Increment a context's usage-count + * + * \deprecated + * + * Note that this function is deprecated and should not be used. + * + * Increments the usage count of the context and passes back a context handle + * in \p *pctx that must be passed to ::cuCtxDetach() when the application is + * done with the context. ::cuCtxAttach() fails if there is no context current + * to the thread. + * + * Currently, the \p flags parameter must be 0. + * + * \param pctx - Returned context handle of the current context + * \param flags - Context attach flags (must be 0) + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxDetach, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuCtxAttach(CUcontext *pctx, unsigned int flags); + +/** + * \brief Decrement a context's usage-count + * + * \deprecated + * + * Note that this function is deprecated and should not be used. + * + * Decrements the usage count of the context \p ctx, and destroys the context + * if the usage count goes to 0. The context must be a handle that was passed + * back by ::cuCtxCreate() or ::cuCtxAttach(), and must be current to the + * calling thread. + * + * \param ctx - Context to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuCtxDetach(CUcontext ctx); + +/** @} */ /* END CUDA_CTX_DEPRECATED */ + + +/** + * \defgroup CUDA_MODULE Module Management + * + * ___MANBRIEF___ module management functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the module management functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Loads a compute module + * + * Takes a filename \p fname and loads the corresponding module \p module into + * the current context. The CUDA driver API does not attempt to lazily + * allocate the resources needed by a module; if the memory for functions and + * data (constant and global) needed by the module cannot be allocated, + * ::cuModuleLoad() fails. The file should be a \e cubin file as output by + * \b nvcc, or a \e PTX file either as output by \b nvcc or handwritten, or + * a \e fatbin file as output by \b nvcc from toolchain 4.0 or later. + * + * \param module - Returned module + * \param fname - Filename of module to load + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, + * ::CUDA_ERROR_NOT_FOUND, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_FILE_NOT_FOUND, + * ::CUDA_ERROR_NO_BINARY_FOR_GPU, + * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, + * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload + */ +CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname); + +/** + * \brief Load a module's data + * + * Takes a pointer \p image and loads the corresponding module \p module into + * the current context. The pointer may be obtained by mapping a \e cubin or + * \e PTX or \e fatbin file, passing a \e cubin or \e PTX or \e fatbin file + * as a NULL-terminated text string, or incorporating a \e cubin or \e fatbin + * object into the executable resources and using operating system calls such + * as Windows \c FindResource() to obtain the pointer. + * + * \param module - Returned module + * \param image - Module data to load + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_NO_BINARY_FOR_GPU, + * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, + * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload + */ +CUresult CUDAAPI cuModuleLoadData(CUmodule *module, const void *image); + +/** + * \brief Load a module's data with options + * + * Takes a pointer \p image and loads the corresponding module \p module into + * the current context. The pointer may be obtained by mapping a \e cubin or + * \e PTX or \e fatbin file, passing a \e cubin or \e PTX or \e fatbin file + * as a NULL-terminated text string, or incorporating a \e cubin or \e fatbin + * object into the executable resources and using operating system calls such + * as Windows \c FindResource() to obtain the pointer. Options are passed as + * an array via \p options and any corresponding parameters are passed in + * \p optionValues. The number of total options is supplied via \p numOptions. + * Any outputs will be returned via \p optionValues. + * + * \param module - Returned module + * \param image - Module data to load + * \param numOptions - Number of options + * \param options - Options for JIT + * \param optionValues - Option values for JIT + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_NO_BINARY_FOR_GPU, + * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, + * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload + */ +CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues); + +/** + * \brief Load a module's data + * + * Takes a pointer \p fatCubin and loads the corresponding module \p module + * into the current context. The pointer represents a fat binary object, + * which is a collection of different \e cubin and/or \e PTX files, all + * representing the same device code, but compiled and optimized for different + * architectures. + * + * Prior to CUDA 4.0, there was no documented API for constructing and using + * fat binary objects by programmers. Starting with CUDA 4.0, fat binary + * objects can be constructed by providing the -fatbin option to \b nvcc. + * More information can be found in the \b nvcc document. + * + * \param module - Returned module + * \param fatCubin - Fat binary to load + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, + * ::CUDA_ERROR_NOT_FOUND, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_NO_BINARY_FOR_GPU, + * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, + * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleUnload + */ +CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin); + +/** + * \brief Unloads a module + * + * Unloads a module \p hmod from the current context. + * + * \param hmod - Module to unload + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_destroy_ub + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary + */ +CUresult CUDAAPI cuModuleUnload(CUmodule hmod); + +/** + * CUDA Lazy Loading status + */ +typedef enum CUmoduleLoadingMode_enum { + CU_MODULE_EAGER_LOADING = 0x1, /**< Lazy Kernel Loading is not enabled */ + CU_MODULE_LAZY_LOADING = 0x2, /**< Lazy Kernel Loading is enabled */ +} CUmoduleLoadingMode; + +/** + * \brief Query lazy loading mode + * + * Returns lazy loading mode + * Module loading mode is controlled by CUDA_MODULE_LOADING env variable + * + * \param mode - Returns the lazy loading mode + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \notefnerr + * + * \sa + * ::cuModuleLoad, + */ +CUresult CUDAAPI cuModuleGetLoadingMode(CUmoduleLoadingMode *mode); + +/** + * \brief Returns a function handle + * + * Returns in \p *hfunc the handle of the function of name \p name located in + * module \p hmod. If no function of that name exists, ::cuModuleGetFunction() + * returns ::CUDA_ERROR_NOT_FOUND. + * + * \param hfunc - Returned function handle + * \param hmod - Module to retrieve function from + * \param name - Name of function to retrieve + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_FOUND + * \notefnerr + * + * \sa ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload + */ +CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name); + +/** + * \brief Returns a global pointer from a module + * + * Returns in \p *dptr and \p *bytes the base pointer and size of the + * global of name \p name located in module \p hmod. If no variable of that name + * exists, ::cuModuleGetGlobal() returns ::CUDA_ERROR_NOT_FOUND. + * One of the parameters \p dptr or \p bytes (not both) can be NULL in which + * case it is ignored. + * + * \param dptr - Returned global device pointer + * \param bytes - Returned global size in bytes + * \param hmod - Module to retrieve global from + * \param name - Name of global to retrieve + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_FOUND + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload, + * ::cudaGetSymbolAddress, + * ::cudaGetSymbolSize + */ +CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name); + +/** + * \brief Creates a pending JIT linker invocation. + * + * If the call is successful, the caller owns the returned CUlinkState, which + * should eventually be destroyed with ::cuLinkDestroy. The + * device code machine size (32 or 64 bit) will match the calling application. + * + * Both linker and compiler options may be specified. Compiler options will + * be applied to inputs to this linker action which must be compiled from PTX. + * The options ::CU_JIT_WALL_TIME, + * ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, and ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES + * will accumulate data until the CUlinkState is destroyed. + * + * \p optionValues must remain valid for the life of the CUlinkState if output + * options are used. No other references to inputs are maintained after this + * call returns. + * + * \note For LTO-IR input, only LTO-IR compiled with toolkits prior to CUDA 12.0 will be accepted + * + * \param numOptions Size of options arrays + * \param options Array of linker and compiler options + * \param optionValues Array of option values, each cast to void * + * \param stateOut On success, this will contain a CUlinkState to specify + * and complete this action + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND + * \notefnerr + * + * \sa ::cuLinkAddData, + * ::cuLinkAddFile, + * ::cuLinkComplete, + * ::cuLinkDestroy + */ +CUresult CUDAAPI +cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut); + +/** + * \brief Add an input to a pending linker invocation + * + * Ownership of \p data is retained by the caller. No reference is retained to any + * inputs after this call returns. + * + * This method accepts only compiler options, which are used if the data must + * be compiled from PTX, and does not accept any of + * ::CU_JIT_WALL_TIME, ::CU_JIT_INFO_LOG_BUFFER, ::CU_JIT_ERROR_LOG_BUFFER, + * ::CU_JIT_TARGET_FROM_CUCONTEXT, or ::CU_JIT_TARGET. + * + * \note For LTO-IR input, only LTO-IR compiled with toolkits prior to CUDA 12.0 will be accepted + * + * \param state A pending linker action. + * \param type The type of the input data. + * \param data The input data. PTX must be NULL-terminated. + * \param size The length of the input data. + * \param name An optional name for this input in log messages. + * \param numOptions Size of options. + * \param options Options to be applied only for this input (overrides options from ::cuLinkCreate). + * \param optionValues Array of option values, each cast to void *. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_IMAGE, + * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_NO_BINARY_FOR_GPU + * + * \sa ::cuLinkCreate, + * ::cuLinkAddFile, + * ::cuLinkComplete, + * ::cuLinkDestroy + */ +CUresult CUDAAPI +cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, + unsigned int numOptions, CUjit_option *options, void **optionValues); + +/** + * \brief Add a file input to a pending linker invocation + * + * No reference is retained to any inputs after this call returns. + * + * This method accepts only compiler options, which are used if the input + * must be compiled from PTX, and does not accept any of + * ::CU_JIT_WALL_TIME, ::CU_JIT_INFO_LOG_BUFFER, ::CU_JIT_ERROR_LOG_BUFFER, + * ::CU_JIT_TARGET_FROM_CUCONTEXT, or ::CU_JIT_TARGET. + * + * This method is equivalent to invoking ::cuLinkAddData on the contents + * of the file. + * + * \note For LTO-IR input, only LTO-IR compiled with toolkits prior to CUDA 12.0 will be accepted + * + * \param state A pending linker action + * \param type The type of the input data + * \param path Path to the input file + * \param numOptions Size of options + * \param options Options to be applied only for this input (overrides options from ::cuLinkCreate) + * \param optionValues Array of option values, each cast to void * + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_FILE_NOT_FOUND + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_IMAGE, + * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_NO_BINARY_FOR_GPU + * + * \sa ::cuLinkCreate, + * ::cuLinkAddData, + * ::cuLinkComplete, + * ::cuLinkDestroy + */ +CUresult CUDAAPI +cuLinkAddFile(CUlinkState state, CUjitInputType type, const char *path, + unsigned int numOptions, CUjit_option *options, void **optionValues); + +/** + * \brief Complete a pending linker invocation + * + * Completes the pending linker action and returns the cubin image for the linked + * device code, which can be used with ::cuModuleLoadData. The cubin is owned by + * \p state, so it should be loaded before \p state is destroyed via ::cuLinkDestroy. + * This call does not destroy \p state. + * + * \param state A pending linker invocation + * \param cubinOut On success, this will point to the output image + * \param sizeOut Optional parameter to receive the size of the generated image + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * + * \sa ::cuLinkCreate, + * ::cuLinkAddData, + * ::cuLinkAddFile, + * ::cuLinkDestroy, + * ::cuModuleLoadData + */ +CUresult CUDAAPI +cuLinkComplete(CUlinkState state, void **cubinOut, size_t *sizeOut); + +/** + * \brief Destroys state for a JIT linker invocation. + * + * \param state State object for the linker invocation + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_HANDLE + * + * \sa ::cuLinkCreate + */ +CUresult CUDAAPI +cuLinkDestroy(CUlinkState state); + +/** @} */ /* END CUDA_MODULE */ + +/** + * \defgroup CUDA_MODULE_DEPRECATED Module Management [DEPRECATED] + * + * ___MANBRIEF___ deprecated module management functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the deprecated module management functions of the low-level + * CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Returns a handle to a texture reference + * + * \deprecated + * + * Returns in \p *pTexRef the handle of the texture reference of name \p name + * in the module \p hmod. If no texture reference of that name exists, + * ::cuModuleGetTexRef() returns ::CUDA_ERROR_NOT_FOUND. This texture reference + * handle should not be destroyed, since it will be destroyed when the module + * is unloaded. + * + * \param pTexRef - Returned texture reference + * \param hmod - Module to retrieve texture reference from + * \param name - Name of texture reference to retrieve + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_FOUND + * \notefnerr + * + * \sa + * ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetSurfRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char *name); + +/** + * \brief Returns a handle to a surface reference + * + * \deprecated + * + * Returns in \p *pSurfRef the handle of the surface reference of name \p name + * in the module \p hmod. If no surface reference of that name exists, + * ::cuModuleGetSurfRef() returns ::CUDA_ERROR_NOT_FOUND. + * + * \param pSurfRef - Returned surface reference + * \param hmod - Module to retrieve surface reference from + * \param name - Name of surface reference to retrieve + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_FOUND + * \notefnerr + * + * \sa + * ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod, const char *name); + +/** @} */ /* END CUDA_MODULE_DEPRECATED */ + +/** + * \defgroup CUDA_LIBRARY Library Management + * + * ___MANBRIEF___ library management functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the library management functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Load a library with specified code and options + * + * Takes a pointer \p code and loads the corresponding library \p library into + * all contexts existent at the time of the call and future contexts at the time + * of creation until the library is unloaded with ::cuLibraryUnload(). + * + * The pointer may be obtained by mapping a \e cubin or \e PTX or \e fatbin file, + * passing a \e cubin or \e PTX or \e fatbin file as a NULL-terminated text string, or + * incorporating a \e cubin or \e fatbin object into the executable resources and + * using operating system calls such as Windows \c FindResource() to obtain the pointer. + * + * Options are passed as an array via \p jitOptions and any corresponding parameters are passed in + * \p jitOptionsValues. The number of total JIT options is supplied via \p numJitOptions. + * Any outputs will be returned via \p jitOptionsValues. + * + * Library load options are passed as an array via \p libraryOptions and any corresponding parameters are passed in + * \p libraryOptionValues. The number of total library load options is supplied via \p numLibraryOptions. + * + * \param library - Returned library + * \param code - Code to load + * \param jitOptions - Options for JIT + * \param jitOptionsValues - Option values for JIT + * \param numJitOptions - Number of options + * \param libraryOptions - Options for loading + * \param libraryOptionValues - Option values for loading + * \param numLibraryOptions - Number of options for loading + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_NO_BINARY_FOR_GPU, + * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, + * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND + * + * \sa ::cuLibraryLoadFromFile, + * ::cuLibraryUnload, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx + */ +CUresult CUDAAPI cuLibraryLoadData(CUlibrary *library, const void *code, + CUjit_option *jitOptions, void **jitOptionsValues, unsigned int numJitOptions, + CUlibraryOption *libraryOptions, void** libraryOptionValues, unsigned int numLibraryOptions); + +/** + * \brief Load a library with specified file and options + * + * Takes a filename \p fileName and loads the corresponding library \p library into + * all contexts existent at the time of the call and future contexts at the time of + * creation until the library is unloaded with ::cuLibraryUnload(). + * + * The file should be a \e cubin file as output by \b nvcc, or a \e PTX file either + * as output by \b nvcc or handwritten, or a \e fatbin file as output by \b nvcc + * from toolchain 4.0 or later. + * + * Options are passed as an array via \p jitOptions and any corresponding parameters are + * passed in \p jitOptionsValues. The number of total options is supplied via \p numJitOptions. + * Any outputs will be returned via \p jitOptionsValues. + * + * Library load options are passed as an array via \p libraryOptions and any corresponding parameters are passed in + * \p libraryOptionValues. The number of total library load options is supplied via \p numLibraryOptions. + * + * \param library - Returned library + * \param fileName - File to load from + * \param jitOptions - Options for JIT + * \param jitOptionsValues - Option values for JIT + * \param numJitOptions - Number of options + * \param libraryOptions - Options for loading + * \param libraryOptionValues - Option values for loading + * \param numLibraryOptions - Number of options for loading + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_NO_BINARY_FOR_GPU, + * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, + * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND + * + * \sa ::cuLibraryLoadData, + * ::cuLibraryUnload, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx + */ +CUresult CUDAAPI cuLibraryLoadFromFile(CUlibrary *library, const char *fileName, + CUjit_option *jitOptions, void **jitOptionsValues, unsigned int numJitOptions, + CUlibraryOption *libraryOptions, void **libraryOptionValues, unsigned int numLibraryOptions); + +/** + * \brief Unloads a library + * + * Unloads the library specified with \p library + * + * \param library - Library to unload + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuLibraryLoadData, + * ::cuLibraryLoadFromFile, + * ::cuModuleUnload + */ +CUresult CUDAAPI cuLibraryUnload(CUlibrary library); + +/** + * \brief Returns a kernel handle + * + * Returns in \p pKernel the handle of the kernel with name \p name located in library \p library. + * If kernel handle is not found, the call returns ::CUDA_ERROR_NOT_FOUND. + * + * \param pKernel - Returned kernel handle + * \param library - Library to retrieve kernel from + * \param name - Name of kernel to retrieve + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_FOUND, + * + * \sa ::cuLibraryLoadData, + * ::cuLibraryLoadFromFile, + * ::cuLibraryUnload, + * ::cuKernelGetFunction, + * ::cuLibraryGetModule, + * ::cuModuleGetFunction + */ +CUresult CUDAAPI cuLibraryGetKernel(CUkernel *pKernel, CUlibrary library, const char *name); + +/** + * \brief Returns a module handle + * + * Returns in \p pMod the module handle associated with the current context located in + * library \p library. If module handle is not found, the call returns ::CUDA_ERROR_NOT_FOUND. + * + * \param pMod - Returned module handle + * \param library - Library to retrieve module from + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_FOUND, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_CONTEXT_IS_DESTROYED + * + * \sa ::cuLibraryLoadData, + * ::cuLibraryLoadFromFile, + * ::cuLibraryUnload, + * ::cuModuleGetFunction + */ +CUresult CUDAAPI cuLibraryGetModule(CUmodule *pMod, CUlibrary library); + +/** + * \brief Returns a function handle + * + * Returns in \p pFunc the handle of the function for the requested kernel \p kernel and + * the current context. If function handle is not found, the call returns ::CUDA_ERROR_NOT_FOUND. + * + * \param pFunc - Returned function handle + * \param kernel - Kernel to retrieve function for the requested context + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_FOUND, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_CONTEXT_IS_DESTROYED + * + * \sa ::cuLibraryLoadData, + * ::cuLibraryLoadFromFile, + * ::cuLibraryUnload, + * ::cuLibraryGetKernel, + * ::cuLibraryGetModule, + * ::cuModuleGetFunction + */ +CUresult CUDAAPI cuKernelGetFunction(CUfunction *pFunc, CUkernel kernel); + +/** + * \brief Returns a global device pointer + * + * Returns in \p *dptr and \p *bytes the base pointer and size of the global with + * name \p name for the requested library \p library and the current context. + * If no global for the requested name \p name exists, the call returns ::CUDA_ERROR_NOT_FOUND. + * One of the parameters \p dptr or \p bytes (not both) can be NULL in which + * case it is ignored. + * + * \param dptr - Returned global device pointer for the requested context + * \param bytes - Returned global size in bytes + * \param library - Library to retrieve global from + * \param name - Name of global to retrieve + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_FOUND, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_CONTEXT_IS_DESTROYED + * + * \sa ::cuLibraryLoadData, + * ::cuLibraryLoadFromFile, + * ::cuLibraryUnload, + * ::cuLibraryGetModule, + * cuModuleGetGlobal + */ +CUresult CUDAAPI cuLibraryGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUlibrary library, const char *name); + +/** + * \brief Returns a pointer to managed memory + * + * Returns in \p *dptr and \p *bytes the base pointer and size of the managed memory with + * name \p name for the requested library \p library. If no managed memory with the + * requested name \p name exists, the call returns ::CUDA_ERROR_NOT_FOUND. One of the parameters + * \p dptr or \p bytes (not both) can be NULL in which case it is ignored. + * Note that managed memory for library \p library is shared across devices and is registered + * when the library is loaded into atleast one context. + * + * \note The API requires a CUDA context to be present and initialized on at least one device. + * If no context is present, the call returns ::CUDA_ERROR_NOT_FOUND. + * + * \param dptr - Returned pointer to the managed memory + * \param bytes - Returned memory size in bytes + * \param library - Library to retrieve managed memory from + * \param name - Name of managed memory to retrieve + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_FOUND, + * + * \sa ::cuLibraryLoadData, + * ::cuLibraryLoadFromFile, + * ::cuLibraryUnload, + */ +CUresult CUDAAPI cuLibraryGetManaged(CUdeviceptr *dptr, size_t *bytes, CUlibrary library, const char *name); + +/** + * \brief Returns a pointer to a unified function + * + * Returns in \p *fptr the function pointer to a unified function denoted by \p symbol. + * If no unified function with name \p symbol exists, the call returns ::CUDA_ERROR_NOT_FOUND. + * If there is no device with attribute ::CU_DEVICE_ATTRIBUTE_UNIFIED_FUNCTION_POINTERS present in the system, + * the call may return ::CUDA_ERROR_NOT_FOUND. + * + * \param fptr - Returned pointer to a unified function + * \param library - Library to retrieve function pointer memory from + * \param symbol - Name of function pointer to retrieve + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_FOUND, + * + * \sa ::cuLibraryLoadData, + * ::cuLibraryLoadFromFile, + * ::cuLibraryUnload, + */ +CUresult CUDAAPI cuLibraryGetUnifiedFunction(void **fptr, CUlibrary library, const char *symbol); + +/** + * \brief Returns information about a kernel + * + * Returns in \p *pi the integer value of the attribute \p attrib for the kernel + * \p kernel for the requested device \p dev. The supported attributes are: + * - ::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: The maximum number of threads + * per block, beyond which a launch of the kernel would fail. This number + * depends on both the kernel and the requested device. + * - ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: The size in bytes of + * statically-allocated shared memory per block required by this kernel. + * This does not include dynamically-allocated shared memory requested by + * the user at runtime. + * - ::CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: The size in bytes of user-allocated + * constant memory required by this kernel. + * - ::CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: The size in bytes of local memory + * used by each thread of this kernel. + * - ::CU_FUNC_ATTRIBUTE_NUM_REGS: The number of registers used by each thread + * of this kernel. + * - ::CU_FUNC_ATTRIBUTE_PTX_VERSION: The PTX virtual architecture version for + * which the kernel was compiled. This value is the major PTX version * 10 + * + the minor PTX version, so a PTX version 1.3 function would return the + * value 13. Note that this may return the undefined value of 0 for cubins + * compiled prior to CUDA 3.0. + * - ::CU_FUNC_ATTRIBUTE_BINARY_VERSION: The binary architecture version for + * which the kernel was compiled. This value is the major binary + * version * 10 + the minor binary version, so a binary version 1.3 function + * would return the value 13. Note that this will return a value of 10 for + * legacy cubins that do not have a properly-encoded binary architecture + * version. + * - ::CU_FUNC_CACHE_MODE_CA: The attribute to indicate whether the kernel has + * been compiled with user specified option "-Xptxas --dlcm=ca" set. + * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: The maximum size in bytes of + * dynamically-allocated shared memory. + * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: Preferred shared memory-L1 + * cache split ratio in percent of total shared memory. + * - ::CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET: If this attribute is set, the + * kernel must launch with a valid cluster size specified. + * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH: The required cluster width in + * blocks. + * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT: The required cluster height in + * blocks. + * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH: The required cluster depth in + * blocks. + * - ::CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED: Indicates whether + * the function can be launched with non-portable cluster size. 1 is allowed, + * 0 is disallowed. A non-portable cluster size may only function on the + * specific SKUs the program is tested on. The launch might fail if the + * program is run on a different hardware platform. CUDA API provides + * cudaOccupancyMaxActiveClusters to assist with checking whether the desired + * size can be launched on the current device. A portable cluster size is + * guaranteed to be functional on all compute capabilities higher than the + * target compute capability. The portable cluster size for sm_90 is 8 blocks + * per cluster. This value may increase for future compute capabilities. The + * specific hardware unit may support higher cluster sizes that’s not + * guaranteed to be portable. + * - ::CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE: The block + * scheduling policy of a function. The value type is CUclusterSchedulingPolicy. + * + * \note If another thread is trying to set the same attribute on the same device using + * ::cuKernelSetAttribute() simultaneously, the attribute query will give the old or new + * value depending on the interleavings chosen by the OS scheduler and memory consistency. + * + * \param pi - Returned attribute value + * \param attrib - Attribute requested + * \param kernel - Kernel to query attribute of + * \param dev - Device to query attribute of + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * + * \sa ::cuLibraryLoadData, + * ::cuLibraryLoadFromFile, + * ::cuLibraryUnload, + * ::cuKernelSetAttribute, + * ::cuLibraryGetKernel, + * ::cuLaunchKernel, + * ::cuKernelGetFunction, + * ::cuLibraryGetModule, + * ::cuModuleGetFunction, + * ::cuFuncGetAttribute + */ +CUresult CUDAAPI cuKernelGetAttribute(int *pi, CUfunction_attribute attrib, CUkernel kernel, CUdevice dev); + +/** + * \brief Sets information about a kernel + * + * This call sets the value of a specified attribute \p attrib on the kernel \p kernel + * for the requested device \p dev to an integer value specified by \p val. + * This function returns CUDA_SUCCESS if the new value of the attribute could be + * successfully set. If the set fails, this call will return an error. + * Not all attributes can have values set. Attempting to set a value on a read-only + * attribute will result in an error (CUDA_ERROR_INVALID_VALUE) + * + * Note that attributes set using ::cuFuncSetAttribute() will override the attribute + * set by this API irrespective of whether the call to ::cuFuncSetAttribute() is made + * before or after this API call. However, ::cuKernelGetAttribute() will always + * return the attribute value set by this API. + * + * Supported attributes are: + * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: This is the maximum size in bytes of + * dynamically-allocated shared memory. The value should contain the requested + * maximum size of dynamically-allocated shared memory. The sum of this value and + * the function attribute ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES cannot exceed the + * device attribute ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN. + * The maximal size of requestable dynamic shared memory may differ by GPU + * architecture. + * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: On devices where the L1 + * cache and shared memory use the same hardware resources, this sets the shared memory + * carveout preference, in percent of the total shared memory. + * See ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR + * This is only a hint, and the driver can choose a different ratio if required to execute the function. + * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH: The required cluster width in + * blocks. The width, height, and depth values must either all be 0 or all be + * positive. The validity of the cluster dimensions is checked at launch time. + * If the value is set during compile time, it cannot be set at runtime. + * Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED. + * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT: The required cluster height in + * blocks. The width, height, and depth values must either all be 0 or all be + * positive. The validity of the cluster dimensions is checked at launch time. + * If the value is set during compile time, it cannot be set at runtime. + * Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED. + * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH: The required cluster depth in + * blocks. The width, height, and depth values must either all be 0 or all be + * positive. The validity of the cluster dimensions is checked at launch time. + * If the value is set during compile time, it cannot be set at runtime. + * Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED. + * - ::CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE: The block + * scheduling policy of a function. The value type is CUclusterSchedulingPolicy. + * + * \note The API has stricter locking requirements in comparison to its legacy counterpart + * ::cuFuncSetAttribute() due to device-wide semantics. If multiple threads are trying to + * set the same attribute on the same device simultaneously, the attribute setting will depend + * on the interleavings chosen by the OS scheduler and memory consistency. + * + * \param attrib - Attribute requested + * \param val - Value to set + * \param kernel - Kernel to set attribute of + * \param dev - Device to set attribute of + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * + * \sa ::cuLibraryLoadData, + * ::cuLibraryLoadFromFile, + * ::cuLibraryUnload, + * ::cuKernelGetAttribute, + * ::cuLibraryGetKernel, + * ::cuLaunchKernel, + * ::cuKernelGetFunction, + * ::cuLibraryGetModule, + * ::cuModuleGetFunction, + * ::cuFuncSetAttribute + */ +CUresult CUDAAPI cuKernelSetAttribute(CUfunction_attribute attrib, int val, CUkernel kernel, CUdevice dev); + +/** + * \brief Sets the preferred cache configuration for a device kernel. + * + * On devices where the L1 cache and shared memory use the same hardware + * resources, this sets through \p config the preferred cache configuration for + * the device kernel \p kernel on the requested device \p dev. This is only a preference. + * The driver will use the requested configuration if possible, but it is free to choose a different + * configuration if required to execute \p kernel. Any context-wide preference + * set via ::cuCtxSetCacheConfig() will be overridden by this per-kernel + * setting. + * + * Note that attributes set using ::cuFuncSetCacheConfig() will override the attribute + * set by this API irrespective of whether the call to ::cuFuncSetCacheConfig() is made + * before or after this API call. + * + * This setting does nothing on devices where the size of the L1 cache and + * shared memory are fixed. + * + * Launching a kernel with a different preference than the most recent + * preference setting may insert a device-side synchronization point. + * + * + * The supported cache configurations are: + * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default) + * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache + * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory + * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory + * + * \note The API has stricter locking requirements in comparison to its legacy counterpart + * ::cuFuncSetCacheConfig() due to device-wide semantics. If multiple threads are trying to + * set a config on the same device simultaneously, the cache config setting will depend + * on the interleavings chosen by the OS scheduler and memory consistency. + * + * \param kernel - Kernel to configure cache for + * \param config - Requested cache configuration + * \param dev - Device to set attribute of + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * + * \sa ::cuLibraryLoadData, + * ::cuLibraryLoadFromFile, + * ::cuLibraryUnload, + * ::cuLibraryGetKernel, + * ::cuKernelGetFunction, + * ::cuLibraryGetModule, + * ::cuModuleGetFunction, + * ::cuFuncSetCacheConfig, + * ::cuCtxSetCacheConfig, + * ::cuLaunchKernel + */ +CUresult CUDAAPI cuKernelSetCacheConfig(CUkernel kernel, CUfunc_cache config, CUdevice dev); + +/** @} */ /* END CUDA_LIBRARY */ + +/** + * \defgroup CUDA_MEM Memory Management + * + * ___MANBRIEF___ memory management functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the memory management functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Gets free and total memory + * + * Returns in \p *total the total amount of memory available to the the current context. + * Returns in \p *free the amount of memory on the device that is free according to the OS. + * CUDA is not guaranteed to be able to allocate all of the memory that the OS reports as free. + * In a multi-tenet situation, free estimate returned is prone to race condition where + * a new allocation/free done by a different process or a different thread in the same + * process between the time when free memory was estimated and reported, will result in + * deviation in free value reported and actual free memory. + * + * The integrated GPU on Tegra shares memory with CPU and other component + * of the SoC. The free and total values returned by the API excludes + * the SWAP memory space maintained by the OS on some platforms. + * The OS may move some of the memory pages into swap area as the GPU or + * CPU allocate or access memory. See Tegra app note on how to calculate + * total and free memory on Tegra. + * + * \param free - Returned free memory in bytes + * \param total - Returned total memory in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemGetInfo + */ +CUresult CUDAAPI cuMemGetInfo(size_t *free, size_t *total); + +/** + * \brief Allocates device memory + * + * Allocates \p bytesize bytes of linear memory on the device and returns in + * \p *dptr a pointer to the allocated memory. The allocated memory is suitably + * aligned for any kind of variable. The memory is not cleared. If \p bytesize + * is 0, ::cuMemAlloc() returns ::CUDA_ERROR_INVALID_VALUE. + * + * \param dptr - Returned device pointer + * \param bytesize - Requested allocation size in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMalloc + */ +CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, size_t bytesize); + +/** + * \brief Allocates pitched device memory + * + * Allocates at least \p WidthInBytes * \p Height bytes of linear memory on + * the device and returns in \p *dptr a pointer to the allocated memory. The + * function may pad the allocation to ensure that corresponding pointers in + * any given row will continue to meet the alignment requirements for + * coalescing as the address is updated from row to row. \p ElementSizeBytes + * specifies the size of the largest reads and writes that will be performed + * on the memory range. \p ElementSizeBytes may be 4, 8 or 16 (since coalesced + * memory transactions are not possible on other data sizes). If + * \p ElementSizeBytes is smaller than the actual read/write size of a kernel, + * the kernel will run correctly, but possibly at reduced speed. The pitch + * returned in \p *pPitch by ::cuMemAllocPitch() is the width in bytes of the + * allocation. The intended usage of pitch is as a separate parameter of the + * allocation, used to compute addresses within the 2D array. Given the row + * and column of an array element of type \b T, the address is computed as: + * \code + T* pElement = (T*)((char*)BaseAddress + Row * Pitch) + Column; + * \endcode + * + * The pitch returned by ::cuMemAllocPitch() is guaranteed to work with + * ::cuMemcpy2D() under all circumstances. For allocations of 2D arrays, it is + * recommended that programmers consider performing pitch allocations using + * ::cuMemAllocPitch(). Due to alignment restrictions in the hardware, this is + * especially true if the application will be performing 2D memory copies + * between different regions of device memory (whether linear memory or CUDA + * arrays). + * + * The byte alignment of the pitch returned by ::cuMemAllocPitch() is guaranteed + * to match or exceed the alignment requirement for texture binding with + * ::cuTexRefSetAddress2D(). + * + * \param dptr - Returned device pointer + * \param pPitch - Returned pitch of allocation in bytes + * \param WidthInBytes - Requested allocation width in bytes + * \param Height - Requested allocation height in rows + * \param ElementSizeBytes - Size of largest reads/writes for range + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMallocPitch + */ +CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes); + +/** + * \brief Frees device memory + * + * Frees the memory space pointed to by \p dptr, which must have been returned + * by a previous call to one of the following memory allocation APIs - ::cuMemAlloc(), + * ::cuMemAllocPitch(), ::cuMemAllocManaged(), ::cuMemAllocAsync(), ::cuMemAllocFromPoolAsync() + * + * Note - This API will not perform any implict synchronization when the pointer was allocated with + * ::cuMemAllocAsync or ::cuMemAllocFromPoolAsync. Callers must ensure that all accesses to the + * pointer have completed before invoking ::cuMemFree. For best performance and memory reuse, users + * should use ::cuMemFreeAsync to free memory allocated via the stream ordered memory allocator. + * + * \param dptr - Pointer to memory to free + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemAllocManaged, ::cuMemAllocAsync, ::cuMemAllocFromPoolAsync, + * ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, ::cuMemcpy3D, ::cuMemcpy3DAsync, + * ::cuMemcpyAtoA, ::cuMemcpyAtoD, ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, + * ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, + * ::cuMemcpyHtoAAsync, ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, ::cuMemFreeAsync, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaFree + */ +CUresult CUDAAPI cuMemFree(CUdeviceptr dptr); + +/** + * \brief Get information on memory allocations + * + * Returns the base address in \p *pbase and size in \p *psize of the + * allocation by ::cuMemAlloc() or ::cuMemAllocPitch() that contains the input + * pointer \p dptr. Both parameters \p pbase and \p psize are optional. If one + * of them is NULL, it is ignored. + * + * \param pbase - Returned base address + * \param psize - Returned size of device memory allocation + * \param dptr - Device pointer to query + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_NOT_FOUND, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 + */ +CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, CUdeviceptr dptr); + +/** + * \brief Allocates page-locked host memory + * + * Allocates \p bytesize bytes of host memory that is page-locked and + * accessible to the device. The driver tracks the virtual memory ranges + * allocated with this function and automatically accelerates calls to + * functions such as ::cuMemcpy(). Since the memory can be accessed directly by + * the device, it can be read or written with much higher bandwidth than + * pageable memory obtained with functions such as ::malloc(). Allocating + * excessive amounts of memory with ::cuMemAllocHost() may degrade system + * performance, since it reduces the amount of memory available to the system + * for paging. As a result, this function is best used sparingly to allocate + * staging areas for data exchange between host and device. + * + * Note all host memory allocated using ::cuMemHostAlloc() will automatically + * be immediately accessible to all contexts on all devices which support unified + * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING). + * The device pointer that may be used to access this host memory from those + * contexts is always equal to the returned host pointer \p *pp. + * See \ref CUDA_UNIFIED for additional details. + * + * \param pp - Returned host pointer to page-locked memory + * \param bytesize - Requested allocation size in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMallocHost + */ +CUresult CUDAAPI cuMemAllocHost(void **pp, size_t bytesize); + +/** + * \brief Frees page-locked host memory + * + * Frees the memory space pointed to by \p p, which must have been returned by + * a previous call to ::cuMemAllocHost(). + * + * \param p - Pointer to memory to free + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaFreeHost + */ +CUresult CUDAAPI cuMemFreeHost(void *p); + +/** + * \brief Allocates page-locked host memory + * + * Allocates \p bytesize bytes of host memory that is page-locked and accessible + * to the device. The driver tracks the virtual memory ranges allocated with + * this function and automatically accelerates calls to functions such as + * ::cuMemcpyHtoD(). Since the memory can be accessed directly by the device, + * it can be read or written with much higher bandwidth than pageable memory + * obtained with functions such as ::malloc(). Allocating excessive amounts of + * pinned memory may degrade system performance, since it reduces the amount + * of memory available to the system for paging. As a result, this function is + * best used sparingly to allocate staging areas for data exchange between + * host and device. + * + * The \p Flags parameter enables different options to be specified that + * affect the allocation, as follows. + * + * - ::CU_MEMHOSTALLOC_PORTABLE: The memory returned by this call will be + * considered as pinned memory by all CUDA contexts, not just the one that + * performed the allocation. + * + * - ::CU_MEMHOSTALLOC_DEVICEMAP: Maps the allocation into the CUDA address + * space. The device pointer to the memory may be obtained by calling + * ::cuMemHostGetDevicePointer(). + * + * - ::CU_MEMHOSTALLOC_WRITECOMBINED: Allocates the memory as write-combined + * (WC). WC memory can be transferred across the PCI Express bus more + * quickly on some system configurations, but cannot be read efficiently by + * most CPUs. WC memory is a good option for buffers that will be written by + * the CPU and read by the GPU via mapped pinned memory or host->device + * transfers. + * + * All of these flags are orthogonal to one another: a developer may allocate + * memory that is portable, mapped and/or write-combined with no restrictions. + * + * The ::CU_MEMHOSTALLOC_DEVICEMAP flag may be specified on CUDA contexts for + * devices that do not support mapped pinned memory. The failure is deferred + * to ::cuMemHostGetDevicePointer() because the memory may be mapped into + * other CUDA contexts via the ::CU_MEMHOSTALLOC_PORTABLE flag. + * + * The memory allocated by this function must be freed with ::cuMemFreeHost(). + * + * Note all host memory allocated using ::cuMemHostAlloc() will automatically + * be immediately accessible to all contexts on all devices which support unified + * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING). + * Unless the flag ::CU_MEMHOSTALLOC_WRITECOMBINED is specified, the device pointer + * that may be used to access this host memory from those contexts is always equal + * to the returned host pointer \p *pp. If the flag ::CU_MEMHOSTALLOC_WRITECOMBINED + * is specified, then the function ::cuMemHostGetDevicePointer() must be used + * to query the device pointer, even if the context supports unified addressing. + * See \ref CUDA_UNIFIED for additional details. + * + * \param pp - Returned host pointer to page-locked memory + * \param bytesize - Requested allocation size in bytes + * \param Flags - Flags for allocation request + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaHostAlloc + */ +CUresult CUDAAPI cuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags); + +/** + * \brief Passes back device pointer of mapped pinned memory + * + * Passes back the device pointer \p pdptr corresponding to the mapped, pinned + * host buffer \p p allocated by ::cuMemHostAlloc. + * + * ::cuMemHostGetDevicePointer() will fail if the ::CU_MEMHOSTALLOC_DEVICEMAP + * flag was not specified at the time the memory was allocated, or if the + * function is called on a GPU that does not support mapped pinned memory. + * + * For devices that have a non-zero value for the device attribute + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM, the memory + * can also be accessed from the device using the host pointer \p p. + * The device pointer returned by ::cuMemHostGetDevicePointer() may or may not + * match the original host pointer \p p and depends on the devices visible to the + * application. If all devices visible to the application have a non-zero value for the + * device attribute, the device pointer returned by ::cuMemHostGetDevicePointer() + * will match the original pointer \p p. If any device visible to the application + * has a zero value for the device attribute, the device pointer returned by + * ::cuMemHostGetDevicePointer() will not match the original host pointer \p p, + * but it will be suitable for use on all devices provided Unified Virtual Addressing + * is enabled. In such systems, it is valid to access the memory using either pointer + * on devices that have a non-zero value for the device attribute. Note however that + * such devices should access the memory using only one of the two pointers and not both. + * + * \p Flags provides for future releases. For now, it must be set to 0. + * + * \param pdptr - Returned device pointer + * \param p - Host pointer + * \param Flags - Options (must be 0) + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaHostGetDevicePointer + */ +CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags); + +/** + * \brief Passes back flags that were used for a pinned allocation + * + * Passes back the flags \p pFlags that were specified when allocating + * the pinned host buffer \p p allocated by ::cuMemHostAlloc. + * + * ::cuMemHostGetFlags() will fail if the pointer does not reside in + * an allocation performed by ::cuMemAllocHost() or ::cuMemHostAlloc(). + * + * \param pFlags - Returned flags word + * \param p - Host pointer + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa + * ::cuMemAllocHost, + * ::cuMemHostAlloc, + * ::cudaHostGetFlags + */ +CUresult CUDAAPI cuMemHostGetFlags(unsigned int *pFlags, void *p); + +/** + * \brief Allocates memory that will be automatically managed by the Unified Memory system + * + * Allocates \p bytesize bytes of managed memory on the device and returns in + * \p *dptr a pointer to the allocated memory. If the device doesn't support + * allocating managed memory, ::CUDA_ERROR_NOT_SUPPORTED is returned. Support + * for managed memory can be queried using the device attribute + * ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY. The allocated memory is suitably + * aligned for any kind of variable. The memory is not cleared. If \p bytesize + * is 0, ::cuMemAllocManaged returns ::CUDA_ERROR_INVALID_VALUE. The pointer + * is valid on the CPU and on all GPUs in the system that support managed memory. + * All accesses to this pointer must obey the Unified Memory programming model. + * + * \p flags specifies the default stream association for this allocation. + * \p flags must be one of ::CU_MEM_ATTACH_GLOBAL or ::CU_MEM_ATTACH_HOST. If + * ::CU_MEM_ATTACH_GLOBAL is specified, then this memory is accessible from + * any stream on any device. If ::CU_MEM_ATTACH_HOST is specified, then the + * allocation should not be accessed from devices that have a zero value for the + * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS; an explicit call to + * ::cuStreamAttachMemAsync will be required to enable access on such devices. + * + * If the association is later changed via ::cuStreamAttachMemAsync to + * a single stream, the default association as specified during ::cuMemAllocManaged + * is restored when that stream is destroyed. For __managed__ variables, the + * default association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a + * stream is an asynchronous operation, and as a result, the change to default + * association won't happen until all work in the stream has completed. + * + * Memory allocated with ::cuMemAllocManaged should be released with ::cuMemFree. + * + * Device memory oversubscription is possible for GPUs that have a non-zero value for the + * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Managed memory on + * such GPUs may be evicted from device memory to host memory at any time by the Unified + * Memory driver in order to make room for other allocations. + * + * In a multi-GPU system where all GPUs have a non-zero value for the device attribute + * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, managed memory may not be populated when this + * API returns and instead may be populated on access. In such systems, managed memory can + * migrate to any processor's memory at any time. The Unified Memory driver will employ heuristics to + * maintain data locality and prevent excessive page faults to the extent possible. The application + * can also guide the driver about memory usage patterns via ::cuMemAdvise. The application + * can also explicitly migrate memory to a desired processor's memory via + * ::cuMemPrefetchAsync. + * + * In a multi-GPU system where all of the GPUs have a zero value for the device attribute + * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS and all the GPUs have peer-to-peer support + * with each other, the physical storage for managed memory is created on the GPU which is active + * at the time ::cuMemAllocManaged is called. All other GPUs will reference the data at reduced + * bandwidth via peer mappings over the PCIe bus. The Unified Memory driver does not migrate + * memory among such GPUs. + * + * In a multi-GPU system where not all GPUs have peer-to-peer support with each other and + * where the value of the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS + * is zero for at least one of those GPUs, the location chosen for physical storage of managed + * memory is system-dependent. + * - On Linux, the location chosen will be device memory as long as the current set of active + * contexts are on devices that either have peer-to-peer support with each other or have a + * non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + * If there is an active context on a GPU that does not have a non-zero value for that device + * attribute and it does not have peer-to-peer support with the other devices that have active + * contexts on them, then the location for physical storage will be 'zero-copy' or host memory. + * Note that this means that managed memory that is located in device memory is migrated to + * host memory if a new context is created on a GPU that doesn't have a non-zero value for + * the device attribute and does not support peer-to-peer with at least one of the other devices + * that has an active context. This in turn implies that context creation may fail if there is + * insufficient host memory to migrate all managed allocations. + * - On Windows, the physical storage is always created in 'zero-copy' or host memory. + * All GPUs will reference the data at reduced bandwidth over the PCIe bus. In these + * circumstances, use of the environment variable CUDA_VISIBLE_DEVICES is recommended to + * restrict CUDA to only use those GPUs that have peer-to-peer support. + * Alternatively, users can also set CUDA_MANAGED_FORCE_DEVICE_ALLOC to a + * non-zero value to force the driver to always use device memory for physical storage. + * When this environment variable is set to a non-zero value, all contexts created in + * that process on devices that support managed memory have to be peer-to-peer compatible + * with each other. Context creation will fail if a context is created on a device that + * supports managed memory and is not peer-to-peer compatible with any of the other + * managed memory supporting devices on which contexts were previously created, even if + * those contexts have been destroyed. These environment variables are described + * in the CUDA programming guide under the "CUDA environment variables" section. + * - On ARM, managed memory is not available on discrete gpu with Drive PX-2. + * + * \param dptr - Returned device pointer + * \param bytesize - Requested allocation size in bytes + * \param flags - Must be one of ::CU_MEM_ATTACH_GLOBAL or ::CU_MEM_ATTACH_HOST + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cuDeviceGetAttribute, ::cuStreamAttachMemAsync, + * ::cudaMallocManaged + */ +CUresult CUDAAPI cuMemAllocManaged(CUdeviceptr *dptr, size_t bytesize, unsigned int flags); + +/** + * \brief Returns a handle to a compute device + * + * Returns in \p *device a device handle given a PCI bus ID string. + * + * \param dev - Returned device handle + * + * \param pciBusId - String in one of the following forms: + * [domain]:[bus]:[device].[function] + * [domain]:[bus]:[device] + * [bus]:[device].[function] + * where \p domain, \p bus, \p device, and \p function are all hexadecimal values + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGet, + * ::cuDeviceGetAttribute, + * ::cuDeviceGetPCIBusId, + * ::cudaDeviceGetByPCIBusId + */ +CUresult CUDAAPI cuDeviceGetByPCIBusId(CUdevice *dev, const char *pciBusId); + +/** + * \brief Returns a PCI Bus Id string for the device + * + * Returns an ASCII string identifying the device \p dev in the NULL-terminated + * string pointed to by \p pciBusId. \p len specifies the maximum length of the + * string that may be returned. + * + * \param pciBusId - Returned identifier string for the device in the following format + * [domain]:[bus]:[device].[function] + * where \p domain, \p bus, \p device, and \p function are all hexadecimal values. + * pciBusId should be large enough to store 13 characters including the NULL-terminator. + * + * \param len - Maximum length of string to store in \p name + * + * \param dev - Device to get identifier string for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGet, + * ::cuDeviceGetAttribute, + * ::cuDeviceGetByPCIBusId, + * ::cudaDeviceGetPCIBusId + */ +CUresult CUDAAPI cuDeviceGetPCIBusId(char *pciBusId, int len, CUdevice dev); + +/** + * \brief Gets an interprocess handle for a previously allocated event + * + * Takes as input a previously allocated event. This event must have been + * created with the ::CU_EVENT_INTERPROCESS and ::CU_EVENT_DISABLE_TIMING + * flags set. This opaque handle may be copied into other processes and + * opened with ::cuIpcOpenEventHandle to allow efficient hardware + * synchronization between GPU work in different processes. + * + * After the event has been opened in the importing process, + * ::cuEventRecord, ::cuEventSynchronize, ::cuStreamWaitEvent and + * ::cuEventQuery may be used in either process. Performing operations + * on the imported event after the exported event has been freed + * with ::cuEventDestroy will result in undefined behavior. + * + * IPC functionality is restricted to devices with support for unified + * addressing on Linux and Windows operating systems. + * IPC functionality on Windows is restricted to GPUs in TCC mode + * Users can test their device for IPC functionality by calling + * ::cuapiDeviceGetAttribute with ::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED + * + * \param pHandle - Pointer to a user allocated CUipcEventHandle + * in which to return the opaque event handle + * \param event - Event allocated with ::CU_EVENT_INTERPROCESS and + * ::CU_EVENT_DISABLE_TIMING flags. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_MAP_FAILED, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuEventCreate, + * ::cuEventDestroy, + * ::cuEventSynchronize, + * ::cuEventQuery, + * ::cuStreamWaitEvent, + * ::cuIpcOpenEventHandle, + * ::cuIpcGetMemHandle, + * ::cuIpcOpenMemHandle, + * ::cuIpcCloseMemHandle, + * ::cudaIpcGetEventHandle + */ +CUresult CUDAAPI cuIpcGetEventHandle(CUipcEventHandle *pHandle, CUevent event); + +/** + * \brief Opens an interprocess event handle for use in the current process + * + * Opens an interprocess event handle exported from another process with + * ::cuIpcGetEventHandle. This function returns a ::CUevent that behaves like + * a locally created event with the ::CU_EVENT_DISABLE_TIMING flag specified. + * This event must be freed with ::cuEventDestroy. + * + * Performing operations on the imported event after the exported event has + * been freed with ::cuEventDestroy will result in undefined behavior. + * + * IPC functionality is restricted to devices with support for unified + * addressing on Linux and Windows operating systems. + * IPC functionality on Windows is restricted to GPUs in TCC mode + * Users can test their device for IPC functionality by calling + * ::cuapiDeviceGetAttribute with ::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED + * + * \param phEvent - Returns the imported event + * \param handle - Interprocess handle to open + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_MAP_FAILED, + * ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuEventCreate, + * ::cuEventDestroy, + * ::cuEventSynchronize, + * ::cuEventQuery, + * ::cuStreamWaitEvent, + * ::cuIpcGetEventHandle, + * ::cuIpcGetMemHandle, + * ::cuIpcOpenMemHandle, + * ::cuIpcCloseMemHandle, + * ::cudaIpcOpenEventHandle + */ +CUresult CUDAAPI cuIpcOpenEventHandle(CUevent *phEvent, CUipcEventHandle handle); + +/** + * \brief Gets an interprocess memory handle for an existing device memory + * allocation + * + * Takes a pointer to the base of an existing device memory allocation created + * with ::cuMemAlloc and exports it for use in another process. This is a + * lightweight operation and may be called multiple times on an allocation + * without adverse effects. + * + * If a region of memory is freed with ::cuMemFree and a subsequent call + * to ::cuMemAlloc returns memory with the same device address, + * ::cuIpcGetMemHandle will return a unique handle for the + * new memory. + * + * IPC functionality is restricted to devices with support for unified + * addressing on Linux and Windows operating systems. + * IPC functionality on Windows is restricted to GPUs in TCC mode + * Users can test their device for IPC functionality by calling + * ::cuapiDeviceGetAttribute with ::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED + * + * \param pHandle - Pointer to user allocated ::CUipcMemHandle to return + * the handle in. + * \param dptr - Base pointer to previously allocated device memory + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_MAP_FAILED, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuMemAlloc, + * ::cuMemFree, + * ::cuIpcGetEventHandle, + * ::cuIpcOpenEventHandle, + * ::cuIpcOpenMemHandle, + * ::cuIpcCloseMemHandle, + * ::cudaIpcGetMemHandle + */ +CUresult CUDAAPI cuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr); + +/** + * \brief Opens an interprocess memory handle exported from another process + * and returns a device pointer usable in the local process. + * + * Maps memory exported from another process with ::cuIpcGetMemHandle into + * the current device address space. For contexts on different devices + * ::cuIpcOpenMemHandle can attempt to enable peer access between the + * devices as if the user called ::cuCtxEnablePeerAccess. This behavior is + * controlled by the ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS flag. + * ::cuDeviceCanAccessPeer can determine if a mapping is possible. + * + * Contexts that may open ::CUipcMemHandles are restricted in the following way. + * ::CUipcMemHandles from each ::CUdevice in a given process may only be opened + * by one ::CUcontext per ::CUdevice per other process. + * + * If the memory handle has already been opened by the current context, the + * reference count on the handle is incremented by 1 and the existing device pointer + * is returned. + * + * Memory returned from ::cuIpcOpenMemHandle must be freed with + * ::cuIpcCloseMemHandle. + * + * Calling ::cuMemFree on an exported memory region before calling + * ::cuIpcCloseMemHandle in the importing context will result in undefined + * behavior. + * + * IPC functionality is restricted to devices with support for unified + * addressing on Linux and Windows operating systems. + * IPC functionality on Windows is restricted to GPUs in TCC mode + * Users can test their device for IPC functionality by calling + * ::cuapiDeviceGetAttribute with ::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED + * + * \param pdptr - Returned device pointer + * \param handle - ::CUipcMemHandle to open + * \param Flags - Flags for this operation. Must be specified as ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_MAP_FAILED, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_TOO_MANY_PEERS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \note No guarantees are made about the address returned in \p *pdptr. + * In particular, multiple processes may not receive the same address for the same \p handle. + * + * \sa + * ::cuMemAlloc, + * ::cuMemFree, + * ::cuIpcGetEventHandle, + * ::cuIpcOpenEventHandle, + * ::cuIpcGetMemHandle, + * ::cuIpcCloseMemHandle, + * ::cuCtxEnablePeerAccess, + * ::cuDeviceCanAccessPeer, + * ::cudaIpcOpenMemHandle + */ +CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags); + +/** + * \brief Attempts to close memory mapped with ::cuIpcOpenMemHandle + * + * Decrements the reference count of the memory returned by ::cuIpcOpenMemHandle by 1. + * When the reference count reaches 0, this API unmaps the memory. The original allocation + * in the exporting process as well as imported mappings in other processes + * will be unaffected. + * + * Any resources used to enable peer access will be freed if this is the + * last mapping using them. + * + * IPC functionality is restricted to devices with support for unified + * addressing on Linux and Windows operating systems. + * IPC functionality on Windows is restricted to GPUs in TCC mode + * Users can test their device for IPC functionality by calling + * ::cuapiDeviceGetAttribute with ::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED + * + * \param dptr - Device pointer returned by ::cuIpcOpenMemHandle + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_MAP_FAILED, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE + * \sa + * ::cuMemAlloc, + * ::cuMemFree, + * ::cuIpcGetEventHandle, + * ::cuIpcOpenEventHandle, + * ::cuIpcGetMemHandle, + * ::cuIpcOpenMemHandle, + * ::cudaIpcCloseMemHandle + */ +CUresult CUDAAPI cuIpcCloseMemHandle(CUdeviceptr dptr); + +/** + * \brief Registers an existing host memory range for use by CUDA + * + * Page-locks the memory range specified by \p p and \p bytesize and maps it + * for the device(s) as specified by \p Flags. This memory range also is added + * to the same tracking mechanism as ::cuMemHostAlloc to automatically accelerate + * calls to functions such as ::cuMemcpyHtoD(). Since the memory can be accessed + * directly by the device, it can be read or written with much higher bandwidth + * than pageable memory that has not been registered. Page-locking excessive + * amounts of memory may degrade system performance, since it reduces the amount + * of memory available to the system for paging. As a result, this function is + * best used sparingly to register staging areas for data exchange between + * host and device. + * + * On systems where ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES + * is true, ::cuMemHostRegister will not page-lock the memory range specified + * by \p ptr but only populate unpopulated pages. + * + * The \p Flags parameter enables different options to be specified that + * affect the allocation, as follows. + * + * - ::CU_MEMHOSTREGISTER_PORTABLE: The memory returned by this call will be + * considered as pinned memory by all CUDA contexts, not just the one that + * performed the allocation. + * + * - ::CU_MEMHOSTREGISTER_DEVICEMAP: Maps the allocation into the CUDA address + * space. The device pointer to the memory may be obtained by calling + * ::cuMemHostGetDevicePointer(). + * + * - ::CU_MEMHOSTREGISTER_IOMEMORY: The pointer is treated as pointing to some + * I/O memory space, e.g. the PCI Express resource of a 3rd party device. + * + * - ::CU_MEMHOSTREGISTER_READ_ONLY: The pointer is treated as pointing to memory + * that is considered read-only by the device. On platforms without + * ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, this flag is + * required in order to register memory mapped to the CPU as read-only. Support + * for the use of this flag can be queried from the device attribute + * ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED. Using this flag with + * a current context associated with a device that does not have this attribute + * set will cause ::cuMemHostRegister to error with CUDA_ERROR_NOT_SUPPORTED. + * + * All of these flags are orthogonal to one another: a developer may page-lock + * memory that is portable or mapped with no restrictions. + * + * The ::CU_MEMHOSTREGISTER_DEVICEMAP flag may be specified on CUDA contexts for + * devices that do not support mapped pinned memory. The failure is deferred + * to ::cuMemHostGetDevicePointer() because the memory may be mapped into + * other CUDA contexts via the ::CU_MEMHOSTREGISTER_PORTABLE flag. + * + * For devices that have a non-zero value for the device attribute + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM, the memory + * can also be accessed from the device using the host pointer \p p. + * The device pointer returned by ::cuMemHostGetDevicePointer() may or may not + * match the original host pointer \p ptr and depends on the devices visible to the + * application. If all devices visible to the application have a non-zero value for the + * device attribute, the device pointer returned by ::cuMemHostGetDevicePointer() + * will match the original pointer \p ptr. If any device visible to the application + * has a zero value for the device attribute, the device pointer returned by + * ::cuMemHostGetDevicePointer() will not match the original host pointer \p ptr, + * but it will be suitable for use on all devices provided Unified Virtual Addressing + * is enabled. In such systems, it is valid to access the memory using either pointer + * on devices that have a non-zero value for the device attribute. Note however that + * such devices should access the memory using only of the two pointers and not both. + * + * The memory page-locked by this function must be unregistered with + * ::cuMemHostUnregister(). + * + * \param p - Host pointer to memory to page-lock + * \param bytesize - Size in bytes of the address range to page-lock + * \param Flags - Flags for allocation request + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED, + * ::CUDA_ERROR_NOT_PERMITTED, + * ::CUDA_ERROR_NOT_SUPPORTED + * \notefnerr + * + * \sa + * ::cuMemHostUnregister, + * ::cuMemHostGetFlags, + * ::cuMemHostGetDevicePointer, + * ::cudaHostRegister + */ +CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags); + +/** + * \brief Unregisters a memory range that was registered with cuMemHostRegister. + * + * Unmaps the memory range whose base address is specified by \p p, and makes + * it pageable again. + * + * The base address must be the same one specified to ::cuMemHostRegister(). + * + * \param p - Host pointer to memory to unregister + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, + * \notefnerr + * + * \sa + * ::cuMemHostRegister, + * ::cudaHostUnregister + */ +CUresult CUDAAPI cuMemHostUnregister(void *p); + +/** + * \brief Copies memory + * + * Copies data between two pointers. + * \p dst and \p src are base pointers of the destination and source, respectively. + * \p ByteCount specifies the number of bytes to copy. + * Note that this function infers the type of the transfer (host to host, host to + * device, device to device, or device to host) from the pointer values. This + * function is only allowed in contexts which support unified addressing. + * + * \param dst - Destination unified virtual address space pointer + * \param src - Source unified virtual address space pointer + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * \note_memcpy + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpy, + * ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol + */ +CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); + +/** + * \brief Copies device memory between two contexts + * + * Copies from device memory in one context to device memory in another + * context. \p dstDevice is the base device pointer of the destination memory + * and \p dstContext is the destination context. \p srcDevice is the base + * device pointer of the source memory and \p srcContext is the source pointer. + * \p ByteCount specifies the number of bytes to copy. + * + * \param dstDevice - Destination device pointer + * \param dstContext - Destination context + * \param srcDevice - Source device pointer + * \param srcContext - Source context + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuMemcpyDtoD, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync, + * ::cuMemcpy3DPeerAsync, + * ::cudaMemcpyPeer + */ +CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount); + +/** + * \brief Copies memory from Host to Device + * + * Copies from host memory to device memory. \p dstDevice and \p srcHost are + * the base addresses of the destination and source, respectively. \p ByteCount + * specifies the number of bytes to copy. + * + * \param dstDevice - Destination device pointer + * \param srcHost - Source host pointer + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * \note_memcpy + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpy, + * ::cudaMemcpyToSymbol + */ +CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount); + +/** + * \brief Copies memory from Device to Host + * + * Copies from device to host memory. \p dstHost and \p srcDevice specify the + * base pointers of the destination and source, respectively. \p ByteCount + * specifies the number of bytes to copy. + * + * \param dstHost - Destination host pointer + * \param srcDevice - Source device pointer + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * \note_memcpy + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpy, + * ::cudaMemcpyFromSymbol + */ +CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount); + +/** + * \brief Copies memory from Device to Device + * + * Copies from device memory to device memory. \p dstDevice and \p srcDevice + * are the base pointers of the destination and source, respectively. + * \p ByteCount specifies the number of bytes to copy. + * + * \param dstDevice - Destination device pointer + * \param srcDevice - Source device pointer + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpy, + * ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol + */ +CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); + +/** + * \brief Copies memory from Device to Array + * + * Copies from device memory to a 1D CUDA array. \p dstArray and \p dstOffset + * specify the CUDA array handle and starting index of the destination data. + * \p srcDevice specifies the base pointer of the source. \p ByteCount + * specifies the number of bytes to copy. + * + * \param dstArray - Destination array + * \param dstOffset - Offset in bytes of destination array + * \param srcDevice - Source device pointer + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpyToArray + */ +CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount); + +/** + * \brief Copies memory from Array to Device + * + * Copies from one 1D CUDA array to device memory. \p dstDevice specifies the + * base pointer of the destination and must be naturally aligned with the CUDA + * array elements. \p srcArray and \p srcOffset specify the CUDA array handle + * and the offset in bytes into the array where the copy is to begin. + * \p ByteCount specifies the number of bytes to copy and must be evenly + * divisible by the array element size. + * + * \param dstDevice - Destination device pointer + * \param srcArray - Source array + * \param srcOffset - Offset in bytes of source array + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpyFromArray + */ +CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount); + +/** + * \brief Copies memory from Host to Array + * + * Copies from host memory to a 1D CUDA array. \p dstArray and \p dstOffset + * specify the CUDA array handle and starting offset in bytes of the destination + * data. \p pSrc specifies the base address of the source. \p ByteCount specifies + * the number of bytes to copy. + * + * \param dstArray - Destination array + * \param dstOffset - Offset in bytes of destination array + * \param srcHost - Source host pointer + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * \note_memcpy + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpyToArray + */ +CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount); + +/** + * \brief Copies memory from Array to Host + * + * Copies from one 1D CUDA array to host memory. \p dstHost specifies the base + * pointer of the destination. \p srcArray and \p srcOffset specify the CUDA + * array handle and starting offset in bytes of the source data. + * \p ByteCount specifies the number of bytes to copy. + * + * \param dstHost - Destination device pointer + * \param srcArray - Source array + * \param srcOffset - Offset in bytes of source array + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * \note_memcpy + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpyFromArray + */ +CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount); + +/** + * \brief Copies memory from Array to Array + * + * Copies from one 1D CUDA array to another. \p dstArray and \p srcArray + * specify the handles of the destination and source CUDA arrays for the copy, + * respectively. \p dstOffset and \p srcOffset specify the destination and + * source offsets in bytes into the CUDA arrays. \p ByteCount is the number of + * bytes to be copied. The size of the elements in the CUDA arrays need not be + * the same format, but the elements must be the same size; and count must be + * evenly divisible by that size. + * + * \param dstArray - Destination array + * \param dstOffset - Offset in bytes of destination array + * \param srcArray - Source array + * \param srcOffset - Offset in bytes of source array + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpyArrayToArray + */ +CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount); + +/** + * \brief Copies memory for 2D arrays + * + * Perform a 2D memory copy according to the parameters specified in \p pCopy. + * The ::CUDA_MEMCPY2D structure is defined as: + * + * \code + typedef struct CUDA_MEMCPY2D_st { + unsigned int srcXInBytes, srcY; + CUmemorytype srcMemoryType; + const void *srcHost; + CUdeviceptr srcDevice; + CUarray srcArray; + unsigned int srcPitch; + + unsigned int dstXInBytes, dstY; + CUmemorytype dstMemoryType; + void *dstHost; + CUdeviceptr dstDevice; + CUarray dstArray; + unsigned int dstPitch; + + unsigned int WidthInBytes; + unsigned int Height; + } CUDA_MEMCPY2D; + * \endcode + * where: + * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the + * source and destination, respectively; ::CUmemorytype_enum is defined as: + * + * \code + typedef enum CUmemorytype_enum { + CU_MEMORYTYPE_HOST = 0x01, + CU_MEMORYTYPE_DEVICE = 0x02, + CU_MEMORYTYPE_ARRAY = 0x03, + CU_MEMORYTYPE_UNIFIED = 0x04 + } CUmemorytype; + * \endcode + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::srcArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch + * specify the (host) base address of the source data and the bytes per row to + * apply. ::srcArray is ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch + * specify the (device) base address of the source data and the bytes per row + * to apply. ::srcArray is ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the + * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are + * ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch + * specify the (host) base address of the destination data and the bytes per + * row to apply. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::dstArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch + * specify the (device) base address of the destination data and the bytes per + * row to apply. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the + * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are + * ignored. + * + * - ::srcXInBytes and ::srcY specify the base address of the source data for + * the copy. + * + * \par + * For host pointers, the starting address is + * \code + void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array + * element size. + * + * - ::dstXInBytes and ::dstY specify the base address of the destination data + * for the copy. + * + * \par + * For host pointers, the base address is + * \code + void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array + * element size. + * + * - ::WidthInBytes and ::Height specify the width (in bytes) and height of + * the 2D copy being performed. + * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + + * ::srcXInBytes, and ::dstPitch must be greater than or equal to + * ::WidthInBytes + dstXInBytes. + * + * \par + * ::cuMemcpy2D() returns an error if any pitch is greater than the maximum + * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back + * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies + * (device to device, CUDA array to device, CUDA array to CUDA array), + * ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch(). + * ::cuMemcpy2DUnaligned() does not have this restriction, but may run + * significantly slower in the cases where ::cuMemcpy2D() would have returned + * an error code. + * + * \param pCopy - Parameters for the memory copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, + * ::cudaMemcpy2DFromArray + */ +CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D *pCopy); + +/** + * \brief Copies memory for 2D arrays + * + * Perform a 2D memory copy according to the parameters specified in \p pCopy. + * The ::CUDA_MEMCPY2D structure is defined as: + * + * \code + typedef struct CUDA_MEMCPY2D_st { + unsigned int srcXInBytes, srcY; + CUmemorytype srcMemoryType; + const void *srcHost; + CUdeviceptr srcDevice; + CUarray srcArray; + unsigned int srcPitch; + unsigned int dstXInBytes, dstY; + CUmemorytype dstMemoryType; + void *dstHost; + CUdeviceptr dstDevice; + CUarray dstArray; + unsigned int dstPitch; + unsigned int WidthInBytes; + unsigned int Height; + } CUDA_MEMCPY2D; + * \endcode + * where: + * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the + * source and destination, respectively; ::CUmemorytype_enum is defined as: + * + * \code + typedef enum CUmemorytype_enum { + CU_MEMORYTYPE_HOST = 0x01, + CU_MEMORYTYPE_DEVICE = 0x02, + CU_MEMORYTYPE_ARRAY = 0x03, + CU_MEMORYTYPE_UNIFIED = 0x04 + } CUmemorytype; + * \endcode + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::srcArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch + * specify the (host) base address of the source data and the bytes per row to + * apply. ::srcArray is ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch + * specify the (device) base address of the source data and the bytes per row + * to apply. ::srcArray is ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the + * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are + * ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::dstArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch + * specify the (host) base address of the destination data and the bytes per + * row to apply. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch + * specify the (device) base address of the destination data and the bytes per + * row to apply. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the + * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are + * ignored. + * + * - ::srcXInBytes and ::srcY specify the base address of the source data for + * the copy. + * + * \par + * For host pointers, the starting address is + * \code + void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array + * element size. + * + * - ::dstXInBytes and ::dstY specify the base address of the destination data + * for the copy. + * + * \par + * For host pointers, the base address is + * \code + void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array + * element size. + * + * - ::WidthInBytes and ::Height specify the width (in bytes) and height of + * the 2D copy being performed. + * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + + * ::srcXInBytes, and ::dstPitch must be greater than or equal to + * ::WidthInBytes + dstXInBytes. + * + * \par + * ::cuMemcpy2D() returns an error if any pitch is greater than the maximum + * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back + * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies + * (device to device, CUDA array to device, CUDA array to CUDA array), + * ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch(). + * ::cuMemcpy2DUnaligned() does not have this restriction, but may run + * significantly slower in the cases where ::cuMemcpy2D() would have returned + * an error code. + * + * \param pCopy - Parameters for the memory copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, + * ::cudaMemcpy2DFromArray + */ +CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy); + +/** + * \brief Copies memory for 3D arrays + * + * Perform a 3D memory copy according to the parameters specified in + * \p pCopy. The ::CUDA_MEMCPY3D structure is defined as: + * + * \code + typedef struct CUDA_MEMCPY3D_st { + + unsigned int srcXInBytes, srcY, srcZ; + unsigned int srcLOD; + CUmemorytype srcMemoryType; + const void *srcHost; + CUdeviceptr srcDevice; + CUarray srcArray; + unsigned int srcPitch; // ignored when src is array + unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1 + + unsigned int dstXInBytes, dstY, dstZ; + unsigned int dstLOD; + CUmemorytype dstMemoryType; + void *dstHost; + CUdeviceptr dstDevice; + CUarray dstArray; + unsigned int dstPitch; // ignored when dst is array + unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1 + + unsigned int WidthInBytes; + unsigned int Height; + unsigned int Depth; + } CUDA_MEMCPY3D; + * \endcode + * where: + * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the + * source and destination, respectively; ::CUmemorytype_enum is defined as: + * + * \code + typedef enum CUmemorytype_enum { + CU_MEMORYTYPE_HOST = 0x01, + CU_MEMORYTYPE_DEVICE = 0x02, + CU_MEMORYTYPE_ARRAY = 0x03, + CU_MEMORYTYPE_UNIFIED = 0x04 + } CUmemorytype; + * \endcode + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::srcArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost, ::srcPitch and + * ::srcHeight specify the (host) base address of the source data, the bytes + * per row, and the height of each 2D slice of the 3D array. ::srcArray is + * ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice, ::srcPitch and + * ::srcHeight specify the (device) base address of the source data, the bytes + * per row, and the height of each 2D slice of the 3D array. ::srcArray is + * ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the + * handle of the source data. ::srcHost, ::srcDevice, ::srcPitch and + * ::srcHeight are ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::dstArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch + * specify the (host) base address of the destination data, the bytes per row, + * and the height of each 2D slice of the 3D array. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch + * specify the (device) base address of the destination data, the bytes per + * row, and the height of each 2D slice of the 3D array. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the + * handle of the destination data. ::dstHost, ::dstDevice, ::dstPitch and + * ::dstHeight are ignored. + * + * - ::srcXInBytes, ::srcY and ::srcZ specify the base address of the source + * data for the copy. + * + * \par + * For host pointers, the starting address is + * \code + void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array + * element size. + * + * - dstXInBytes, ::dstY and ::dstZ specify the base address of the + * destination data for the copy. + * + * \par + * For host pointers, the base address is + * \code + void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array + * element size. + * + * - ::WidthInBytes, ::Height and ::Depth specify the width (in bytes), height + * and depth of the 3D copy being performed. + * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + + * ::srcXInBytes, and ::dstPitch must be greater than or equal to + * ::WidthInBytes + dstXInBytes. + * - If specified, ::srcHeight must be greater than or equal to ::Height + + * ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY. + * + * \par + * ::cuMemcpy3D() returns an error if any pitch is greater than the maximum + * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). + * + * The ::srcLOD and ::dstLOD members of the ::CUDA_MEMCPY3D structure must be + * set to 0. + * + * \param pCopy - Parameters for the memory copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpy3D + */ +CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D *pCopy); + +/** + * \brief Copies memory between contexts + * + * Perform a 3D memory copy according to the parameters specified in + * \p pCopy. See the definition of the ::CUDA_MEMCPY3D_PEER structure + * for documentation of its parameters. + * + * \param pCopy - Parameters for the memory copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync, + * ::cuMemcpy3DPeerAsync, + * ::cudaMemcpy3DPeer + */ +CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy); + +/** + * \brief Copies memory asynchronously + * + * Copies data between two pointers. + * \p dst and \p src are base pointers of the destination and source, respectively. + * \p ByteCount specifies the number of bytes to copy. + * Note that this function infers the type of the transfer (host to host, host to + * device, device to device, or device to host) from the pointer values. This + * function is only allowed in contexts which support unified addressing. + * + * \param dst - Destination unified virtual address space pointer + * \param src - Source unified virtual address space pointer + * \param ByteCount - Size of memory copy in bytes + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * \note_async + * \note_null_stream + * \note_memcpy + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemcpyAsync, + * ::cudaMemcpyToSymbolAsync, + * ::cudaMemcpyFromSymbolAsync + */ +CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream); + +/** + * \brief Copies device memory between two contexts asynchronously. + * + * Copies from device memory in one context to device memory in another + * context. \p dstDevice is the base device pointer of the destination memory + * and \p dstContext is the destination context. \p srcDevice is the base + * device pointer of the source memory and \p srcContext is the source pointer. + * \p ByteCount specifies the number of bytes to copy. + * + * \param dstDevice - Destination device pointer + * \param dstContext - Destination context + * \param srcDevice - Source device pointer + * \param srcContext - Source context + * \param ByteCount - Size of memory copy in bytes + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync, + * ::cuMemcpy3DPeerAsync, + * ::cudaMemcpyPeerAsync + */ +CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream); + +/** + * \brief Copies memory from Host to Device + * + * Copies from host memory to device memory. \p dstDevice and \p srcHost are + * the base addresses of the destination and source, respectively. \p ByteCount + * specifies the number of bytes to copy. + * + * \param dstDevice - Destination device pointer + * \param srcHost - Source host pointer + * \param ByteCount - Size of memory copy in bytes + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * \note_async + * \note_null_stream + * \note_memcpy + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemcpyAsync, + * ::cudaMemcpyToSymbolAsync + */ +CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream); + +/** + * \brief Copies memory from Device to Host + * + * Copies from device to host memory. \p dstHost and \p srcDevice specify the + * base pointers of the destination and source, respectively. \p ByteCount + * specifies the number of bytes to copy. + * + * \param dstHost - Destination host pointer + * \param srcDevice - Source device pointer + * \param ByteCount - Size of memory copy in bytes + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * \note_async + * \note_null_stream + * \note_memcpy + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemcpyAsync, + * ::cudaMemcpyFromSymbolAsync + */ +CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); + +/** + * \brief Copies memory from Device to Device + * + * Copies from device memory to device memory. \p dstDevice and \p srcDevice + * are the base pointers of the destination and source, respectively. + * \p ByteCount specifies the number of bytes to copy. + * + * \param dstDevice - Destination device pointer + * \param srcDevice - Source device pointer + * \param ByteCount - Size of memory copy in bytes + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemcpyAsync, + * ::cudaMemcpyToSymbolAsync, + * ::cudaMemcpyFromSymbolAsync + */ +CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); + +/** + * \brief Copies memory from Host to Array + * + * Copies from host memory to a 1D CUDA array. \p dstArray and \p dstOffset + * specify the CUDA array handle and starting offset in bytes of the + * destination data. \p srcHost specifies the base address of the source. + * \p ByteCount specifies the number of bytes to copy. + * + * \param dstArray - Destination array + * \param dstOffset - Offset in bytes of destination array + * \param srcHost - Source host pointer + * \param ByteCount - Size of memory copy in bytes + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * \note_async + * \note_null_stream + * \note_memcpy + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemcpyToArrayAsync + */ +CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream); + +/** + * \brief Copies memory from Array to Host + * + * Copies from one 1D CUDA array to host memory. \p dstHost specifies the base + * pointer of the destination. \p srcArray and \p srcOffset specify the CUDA + * array handle and starting offset in bytes of the source data. + * \p ByteCount specifies the number of bytes to copy. + * + * \param dstHost - Destination pointer + * \param srcArray - Source array + * \param srcOffset - Offset in bytes of source array + * \param ByteCount - Size of memory copy in bytes + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * \note_async + * \note_null_stream + * \note_memcpy + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemcpyFromArrayAsync + */ +CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream); + +/** + * \brief Copies memory for 2D arrays + * + * Perform a 2D memory copy according to the parameters specified in \p pCopy. + * The ::CUDA_MEMCPY2D structure is defined as: + * + * \code + typedef struct CUDA_MEMCPY2D_st { + unsigned int srcXInBytes, srcY; + CUmemorytype srcMemoryType; + const void *srcHost; + CUdeviceptr srcDevice; + CUarray srcArray; + unsigned int srcPitch; + unsigned int dstXInBytes, dstY; + CUmemorytype dstMemoryType; + void *dstHost; + CUdeviceptr dstDevice; + CUarray dstArray; + unsigned int dstPitch; + unsigned int WidthInBytes; + unsigned int Height; + } CUDA_MEMCPY2D; + * \endcode + * where: + * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the + * source and destination, respectively; ::CUmemorytype_enum is defined as: + * + * \code + typedef enum CUmemorytype_enum { + CU_MEMORYTYPE_HOST = 0x01, + CU_MEMORYTYPE_DEVICE = 0x02, + CU_MEMORYTYPE_ARRAY = 0x03, + CU_MEMORYTYPE_UNIFIED = 0x04 + } CUmemorytype; + * \endcode + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch + * specify the (host) base address of the source data and the bytes per row to + * apply. ::srcArray is ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::srcArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch + * specify the (device) base address of the source data and the bytes per row + * to apply. ::srcArray is ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the + * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are + * ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::dstArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch + * specify the (host) base address of the destination data and the bytes per + * row to apply. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch + * specify the (device) base address of the destination data and the bytes per + * row to apply. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the + * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are + * ignored. + * + * - ::srcXInBytes and ::srcY specify the base address of the source data for + * the copy. + * + * \par + * For host pointers, the starting address is + * \code + void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array + * element size. + * + * - ::dstXInBytes and ::dstY specify the base address of the destination data + * for the copy. + * + * \par + * For host pointers, the base address is + * \code + void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array + * element size. + * + * - ::WidthInBytes and ::Height specify the width (in bytes) and height of + * the 2D copy being performed. + * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + + * ::srcXInBytes, and ::dstPitch must be greater than or equal to + * ::WidthInBytes + dstXInBytes. + * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + + * ::srcXInBytes, and ::dstPitch must be greater than or equal to + * ::WidthInBytes + dstXInBytes. + * - If specified, ::srcHeight must be greater than or equal to ::Height + + * ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY. + * + * \par + * ::cuMemcpy2DAsync() returns an error if any pitch is greater than the maximum + * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back + * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies + * (device to device, CUDA array to device, CUDA array to CUDA array), + * ::cuMemcpy2DAsync() may fail for pitches not computed by ::cuMemAllocPitch(). + * + * \param pCopy - Parameters for the memory copy + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemcpy2DAsync, + * ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpy2DFromArrayAsync + */ +CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream); + +/** + * \brief Copies memory for 3D arrays + * + * Perform a 3D memory copy according to the parameters specified in + * \p pCopy. The ::CUDA_MEMCPY3D structure is defined as: + * + * \code + typedef struct CUDA_MEMCPY3D_st { + + unsigned int srcXInBytes, srcY, srcZ; + unsigned int srcLOD; + CUmemorytype srcMemoryType; + const void *srcHost; + CUdeviceptr srcDevice; + CUarray srcArray; + unsigned int srcPitch; // ignored when src is array + unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1 + + unsigned int dstXInBytes, dstY, dstZ; + unsigned int dstLOD; + CUmemorytype dstMemoryType; + void *dstHost; + CUdeviceptr dstDevice; + CUarray dstArray; + unsigned int dstPitch; // ignored when dst is array + unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1 + + unsigned int WidthInBytes; + unsigned int Height; + unsigned int Depth; + } CUDA_MEMCPY3D; + * \endcode + * where: + * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the + * source and destination, respectively; ::CUmemorytype_enum is defined as: + * + * \code + typedef enum CUmemorytype_enum { + CU_MEMORYTYPE_HOST = 0x01, + CU_MEMORYTYPE_DEVICE = 0x02, + CU_MEMORYTYPE_ARRAY = 0x03, + CU_MEMORYTYPE_UNIFIED = 0x04 + } CUmemorytype; + * \endcode + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::srcArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost, ::srcPitch and + * ::srcHeight specify the (host) base address of the source data, the bytes + * per row, and the height of each 2D slice of the 3D array. ::srcArray is + * ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice, ::srcPitch and + * ::srcHeight specify the (device) base address of the source data, the bytes + * per row, and the height of each 2D slice of the 3D array. ::srcArray is + * ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the + * handle of the source data. ::srcHost, ::srcDevice, ::srcPitch and + * ::srcHeight are ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::dstArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch + * specify the (host) base address of the destination data, the bytes per row, + * and the height of each 2D slice of the 3D array. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch + * specify the (device) base address of the destination data, the bytes per + * row, and the height of each 2D slice of the 3D array. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the + * handle of the destination data. ::dstHost, ::dstDevice, ::dstPitch and + * ::dstHeight are ignored. + * + * - ::srcXInBytes, ::srcY and ::srcZ specify the base address of the source + * data for the copy. + * + * \par + * For host pointers, the starting address is + * \code + void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array + * element size. + * + * - dstXInBytes, ::dstY and ::dstZ specify the base address of the + * destination data for the copy. + * + * \par + * For host pointers, the base address is + * \code + void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array + * element size. + * + * - ::WidthInBytes, ::Height and ::Depth specify the width (in bytes), height + * and depth of the 3D copy being performed. + * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + + * ::srcXInBytes, and ::dstPitch must be greater than or equal to + * ::WidthInBytes + dstXInBytes. + * - If specified, ::srcHeight must be greater than or equal to ::Height + + * ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY. + * + * \par + * ::cuMemcpy3DAsync() returns an error if any pitch is greater than the maximum + * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). + * + * The ::srcLOD and ::dstLOD members of the ::CUDA_MEMCPY3D structure must be + * set to 0. + * + * \param pCopy - Parameters for the memory copy + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemcpy3DAsync + */ +CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream); + +/** + * \brief Copies memory between contexts asynchronously. + * + * Perform a 3D memory copy according to the parameters specified in + * \p pCopy. See the definition of the ::CUDA_MEMCPY3D_PEER structure + * for documentation of its parameters. + * + * \param pCopy - Parameters for the memory copy + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync, + * ::cuMemcpy3DPeerAsync, + * ::cudaMemcpy3DPeerAsync + */ +CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream); + +/** + * \brief Initializes device memory + * + * Sets the memory range of \p N 8-bit values to the specified value + * \p uc. + * + * \param dstDevice - Destination device pointer + * \param uc - Value to set + * \param N - Number of elements + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemset + */ +CUresult CUDAAPI cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N); + +/** + * \brief Initializes device memory + * + * Sets the memory range of \p N 16-bit values to the specified value + * \p us. The \p dstDevice pointer must be two byte aligned. + * + * \param dstDevice - Destination device pointer + * \param us - Value to set + * \param N - Number of elements + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemset + */ +CUresult CUDAAPI cuMemsetD16(CUdeviceptr dstDevice, unsigned short us, size_t N); + +/** + * \brief Initializes device memory + * + * Sets the memory range of \p N 32-bit values to the specified value + * \p ui. The \p dstDevice pointer must be four byte aligned. + * + * \param dstDevice - Destination device pointer + * \param ui - Value to set + * \param N - Number of elements + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32Async, + * ::cudaMemset + */ +CUresult CUDAAPI cuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N); + +/** + * \brief Initializes device memory + * + * Sets the 2D memory range of \p Width 8-bit values to the specified value + * \p uc. \p Height specifies the number of rows to set, and \p dstPitch + * specifies the number of bytes between each row. This function performs + * fastest when the pitch is one that has been passed back by + * ::cuMemAllocPitch(). + * + * \param dstDevice - Destination device pointer + * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) + * \param uc - Value to set + * \param Width - Width of row + * \param Height - Number of rows + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemset2D + */ +CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height); + +/** + * \brief Initializes device memory + * + * Sets the 2D memory range of \p Width 16-bit values to the specified value + * \p us. \p Height specifies the number of rows to set, and \p dstPitch + * specifies the number of bytes between each row. The \p dstDevice pointer + * and \p dstPitch offset must be two byte aligned. This function performs + * fastest when the pitch is one that has been passed back by + * ::cuMemAllocPitch(). + * + * \param dstDevice - Destination device pointer + * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) + * \param us - Value to set + * \param Width - Width of row + * \param Height - Number of rows + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemset2D + */ +CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height); + +/** + * \brief Initializes device memory + * + * Sets the 2D memory range of \p Width 32-bit values to the specified value + * \p ui. \p Height specifies the number of rows to set, and \p dstPitch + * specifies the number of bytes between each row. The \p dstDevice pointer + * and \p dstPitch offset must be four byte aligned. This function performs + * fastest when the pitch is one that has been passed back by + * ::cuMemAllocPitch(). + * + * \param dstDevice - Destination device pointer + * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) + * \param ui - Value to set + * \param Width - Width of row + * \param Height - Number of rows + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemset2D + */ +CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height); + +/** + * \brief Sets device memory + * + * Sets the memory range of \p N 8-bit values to the specified value + * \p uc. + * + * \param dstDevice - Destination device pointer + * \param uc - Value to set + * \param N - Number of elements + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemsetAsync + */ +CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream); + +/** + * \brief Sets device memory + * + * Sets the memory range of \p N 16-bit values to the specified value + * \p us. The \p dstDevice pointer must be two byte aligned. + * + * \param dstDevice - Destination device pointer + * \param us - Value to set + * \param N - Number of elements + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemsetAsync + */ +CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream); + +/** + * \brief Sets device memory + * + * Sets the memory range of \p N 32-bit values to the specified value + * \p ui. The \p dstDevice pointer must be four byte aligned. + * + * \param dstDevice - Destination device pointer + * \param ui - Value to set + * \param N - Number of elements + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, ::cuMemsetD32, + * ::cudaMemsetAsync + */ +CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream); + +/** + * \brief Sets device memory + * + * Sets the 2D memory range of \p Width 8-bit values to the specified value + * \p uc. \p Height specifies the number of rows to set, and \p dstPitch + * specifies the number of bytes between each row. This function performs + * fastest when the pitch is one that has been passed back by + * ::cuMemAllocPitch(). + * + * \param dstDevice - Destination device pointer + * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) + * \param uc - Value to set + * \param Width - Width of row + * \param Height - Number of rows + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemset2DAsync + */ +CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream); + +/** + * \brief Sets device memory + * + * Sets the 2D memory range of \p Width 16-bit values to the specified value + * \p us. \p Height specifies the number of rows to set, and \p dstPitch + * specifies the number of bytes between each row. The \p dstDevice pointer + * and \p dstPitch offset must be two byte aligned. This function performs + * fastest when the pitch is one that has been passed back by + * ::cuMemAllocPitch(). + * + * \param dstDevice - Destination device pointer + * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) + * \param us - Value to set + * \param Width - Width of row + * \param Height - Number of rows + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemset2DAsync + */ +CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream); + +/** + * \brief Sets device memory + * + * Sets the 2D memory range of \p Width 32-bit values to the specified value + * \p ui. \p Height specifies the number of rows to set, and \p dstPitch + * specifies the number of bytes between each row. The \p dstDevice pointer + * and \p dstPitch offset must be four byte aligned. This function performs + * fastest when the pitch is one that has been passed back by + * ::cuMemAllocPitch(). + * + * \param dstDevice - Destination device pointer + * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) + * \param ui - Value to set + * \param Width - Width of row + * \param Height - Number of rows + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemset2DAsync + */ +CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream); + +/** + * \brief Creates a 1D or 2D CUDA array + * + * Creates a CUDA array according to the ::CUDA_ARRAY_DESCRIPTOR structure + * \p pAllocateArray and returns a handle to the new CUDA array in \p *pHandle. + * The ::CUDA_ARRAY_DESCRIPTOR is defined as: + * + * \code + typedef struct { + unsigned int Width; + unsigned int Height; + CUarray_format Format; + unsigned int NumChannels; + } CUDA_ARRAY_DESCRIPTOR; + * \endcode + * where: + * + * - \p Width, and \p Height are the width, and height of the CUDA array (in + * elements); the CUDA array is one-dimensional if height is 0, two-dimensional + * otherwise; + * - ::Format specifies the format of the elements; ::CUarray_format is + * defined as: + * \code + typedef enum CUarray_format_enum { + CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, + CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, + CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, + CU_AD_FORMAT_SIGNED_INT8 = 0x08, + CU_AD_FORMAT_SIGNED_INT16 = 0x09, + CU_AD_FORMAT_SIGNED_INT32 = 0x0a, + CU_AD_FORMAT_HALF = 0x10, + CU_AD_FORMAT_FLOAT = 0x20 + } CUarray_format; + * \endcode + * - \p NumChannels specifies the number of packed components per CUDA array + * element; it may be 1, 2, or 4; + * + * Here are examples of CUDA array descriptions: + * + * Description for a CUDA array of 2048 floats: + * \code + CUDA_ARRAY_DESCRIPTOR desc; + desc.Format = CU_AD_FORMAT_FLOAT; + desc.NumChannels = 1; + desc.Width = 2048; + desc.Height = 1; + * \endcode + * + * Description for a 64 x 64 CUDA array of floats: + * \code + CUDA_ARRAY_DESCRIPTOR desc; + desc.Format = CU_AD_FORMAT_FLOAT; + desc.NumChannels = 1; + desc.Width = 64; + desc.Height = 64; + * \endcode + * + * Description for a \p width x \p height CUDA array of 64-bit, 4x16-bit + * float16's: + * \code + CUDA_ARRAY_DESCRIPTOR desc; + desc.Format = CU_AD_FORMAT_HALF; + desc.NumChannels = 4; + desc.Width = width; + desc.Height = height; + * \endcode + * + * Description for a \p width x \p height CUDA array of 16-bit elements, each + * of which is two 8-bit unsigned chars: + * \code + CUDA_ARRAY_DESCRIPTOR arrayDesc; + desc.Format = CU_AD_FORMAT_UNSIGNED_INT8; + desc.NumChannels = 2; + desc.Width = width; + desc.Height = height; + * \endcode + * + * \param pHandle - Returned array + * \param pAllocateArray - Array descriptor + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMallocArray + */ +CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray); + +/** + * \brief Get a 1D or 2D CUDA array descriptor + * + * Returns in \p *pArrayDescriptor a descriptor containing information on the + * format and dimensions of the CUDA array \p hArray. It is useful for + * subroutines that have been passed a CUDA array, but need to know the CUDA + * array parameters for validation or other purposes. + * + * \param pArrayDescriptor - Returned array descriptor + * \param hArray - Array to get descriptor of + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaArrayGetInfo + */ +CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray); + +/** + * \brief Returns the layout properties of a sparse CUDA array + * + * Returns the layout properties of a sparse CUDA array in \p sparseProperties + * If the CUDA array is not allocated with flag ::CUDA_ARRAY3D_SPARSE + * ::CUDA_ERROR_INVALID_VALUE will be returned. + * + * If the returned value in ::CUDA_ARRAY_SPARSE_PROPERTIES::flags contains ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL, + * then ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize represents the total size of the array. Otherwise, it will be zero. + * Also, the returned value in ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailFirstLevel is always zero. + * Note that the \p array must have been allocated using ::cuArrayCreate or ::cuArray3DCreate. For CUDA arrays obtained + * using ::cuMipmappedArrayGetLevel, ::CUDA_ERROR_INVALID_VALUE will be returned. Instead, ::cuMipmappedArrayGetSparseProperties + * must be used to obtain the sparse properties of the entire CUDA mipmapped array to which \p array belongs to. + * + * \return + * ::CUDA_SUCCESS + * ::CUDA_ERROR_INVALID_VALUE + * + * \param[out] sparseProperties - Pointer to ::CUDA_ARRAY_SPARSE_PROPERTIES + * \param[in] array - CUDA array to get the sparse properties of + * \sa ::cuMipmappedArrayGetSparseProperties, ::cuMemMapArrayAsync + */ +CUresult CUDAAPI cuArrayGetSparseProperties(CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties, CUarray array); + +/** + * \brief Returns the layout properties of a sparse CUDA mipmapped array + * + * Returns the sparse array layout properties in \p sparseProperties + * If the CUDA mipmapped array is not allocated with flag ::CUDA_ARRAY3D_SPARSE + * ::CUDA_ERROR_INVALID_VALUE will be returned. + * + * For non-layered CUDA mipmapped arrays, ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize returns the + * size of the mip tail region. The mip tail region includes all mip levels whose width, height or depth + * is less than that of the tile. + * For layered CUDA mipmapped arrays, if ::CUDA_ARRAY_SPARSE_PROPERTIES::flags contains ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL, + * then ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize specifies the size of the mip tail of all layers combined. + * Otherwise, ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize specifies mip tail size per layer. + * The returned value of ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailFirstLevel is valid only if ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize is non-zero. + * + * \return + * ::CUDA_SUCCESS + * ::CUDA_ERROR_INVALID_VALUE + * + * \param[out] sparseProperties - Pointer to ::CUDA_ARRAY_SPARSE_PROPERTIES + * \param[in] mipmap - CUDA mipmapped array to get the sparse properties of + * \sa ::cuArrayGetSparseProperties, ::cuMemMapArrayAsync + */ +CUresult CUDAAPI cuMipmappedArrayGetSparseProperties(CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties, CUmipmappedArray mipmap); + +/** + * \brief Returns the memory requirements of a CUDA array + * + * Returns the memory requirements of a CUDA array in \p memoryRequirements + * If the CUDA array is not allocated with flag ::CUDA_ARRAY3D_DEFERRED_MAPPING + * ::CUDA_ERROR_INVALID_VALUE will be returned. + * + * The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::size + * represents the total size of the CUDA array. + * The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::alignment + * represents the alignment necessary for mapping the CUDA array. + * + * \return + * ::CUDA_SUCCESS + * ::CUDA_ERROR_INVALID_VALUE + * + * \param[out] memoryRequirements - Pointer to ::CUDA_ARRAY_MEMORY_REQUIREMENTS + * \param[in] array - CUDA array to get the memory requirements of + * \param[in] device - Device to get the memory requirements for + * \sa ::cuMipmappedArrayGetMemoryRequirements, ::cuMemMapArrayAsync + */ +CUresult CUDAAPI cuArrayGetMemoryRequirements(CUDA_ARRAY_MEMORY_REQUIREMENTS *memoryRequirements, CUarray array, CUdevice device); + +/** + * \brief Returns the memory requirements of a CUDA mipmapped array + * + * Returns the memory requirements of a CUDA mipmapped array in \p memoryRequirements + * If the CUDA mipmapped array is not allocated with flag ::CUDA_ARRAY3D_DEFERRED_MAPPING + * ::CUDA_ERROR_INVALID_VALUE will be returned. + * + * The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::size + * represents the total size of the CUDA mipmapped array. + * The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::alignment + * represents the alignment necessary for mapping the CUDA mipmapped + * array. + * + * \return + * ::CUDA_SUCCESS + * ::CUDA_ERROR_INVALID_VALUE + * + * \param[out] memoryRequirements - Pointer to ::CUDA_ARRAY_MEMORY_REQUIREMENTS + * \param[in] mipmap - CUDA mipmapped array to get the memory requirements of + * \param[in] device - Device to get the memory requirements for + * \sa ::cuArrayGetMemoryRequirements, ::cuMemMapArrayAsync + */ +CUresult CUDAAPI cuMipmappedArrayGetMemoryRequirements(CUDA_ARRAY_MEMORY_REQUIREMENTS *memoryRequirements, CUmipmappedArray mipmap, CUdevice device); + +/** + * \brief Gets a CUDA array plane from a CUDA array + * + * Returns in \p pPlaneArray a CUDA array that represents a single format plane + * of the CUDA array \p hArray. + * + * If \p planeIdx is greater than the maximum number of planes in this array or if the array does + * not have a multi-planar format e.g: ::CU_AD_FORMAT_NV12, then ::CUDA_ERROR_INVALID_VALUE is returned. + * + * Note that if the \p hArray has format ::CU_AD_FORMAT_NV12, then passing in 0 for \p planeIdx returns + * a CUDA array of the same size as \p hArray but with one channel and ::CU_AD_FORMAT_UNSIGNED_INT8 as its format. + * If 1 is passed for \p planeIdx, then the returned CUDA array has half the height and width + * of \p hArray with two channels and ::CU_AD_FORMAT_UNSIGNED_INT8 as its format. + * + * \param pPlaneArray - Returned CUDA array referenced by the \p planeIdx + * \param hArray - Multiplanar CUDA array + * \param planeIdx - Plane index + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa + * ::cuArrayCreate, + * ::cudaArrayGetPlane + */ +CUresult CUDAAPI cuArrayGetPlane(CUarray *pPlaneArray, CUarray hArray, unsigned int planeIdx); + +/** + * \brief Destroys a CUDA array + * + * Destroys the CUDA array \p hArray. + * + * \param hArray - Array to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_ARRAY_IS_MAPPED, + * ::CUDA_ERROR_CONTEXT_IS_DESTROYED + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaFreeArray + */ +CUresult CUDAAPI cuArrayDestroy(CUarray hArray); + +/** + * \brief Creates a 3D CUDA array + * + * Creates a CUDA array according to the ::CUDA_ARRAY3D_DESCRIPTOR structure + * \p pAllocateArray and returns a handle to the new CUDA array in \p *pHandle. + * The ::CUDA_ARRAY3D_DESCRIPTOR is defined as: + * + * \code + typedef struct { + unsigned int Width; + unsigned int Height; + unsigned int Depth; + CUarray_format Format; + unsigned int NumChannels; + unsigned int Flags; + } CUDA_ARRAY3D_DESCRIPTOR; + * \endcode + * where: + * + * - \p Width, \p Height, and \p Depth are the width, height, and depth of the + * CUDA array (in elements); the following types of CUDA arrays can be allocated: + * - A 1D array is allocated if \p Height and \p Depth extents are both zero. + * - A 2D array is allocated if only \p Depth extent is zero. + * - A 3D array is allocated if all three extents are non-zero. + * - A 1D layered CUDA array is allocated if only \p Height is zero and the + * ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number + * of layers is determined by the depth extent. + * - A 2D layered CUDA array is allocated if all three extents are non-zero and + * the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number + * of layers is determined by the depth extent. + * - A cubemap CUDA array is allocated if all three extents are non-zero and the + * ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and + * \p Depth must be six. A cubemap is a special type of 2D layered CUDA array, + * where the six layers represent the six faces of a cube. The order of the six + * layers in memory is the same as that listed in ::CUarray_cubemap_face. + * - A cubemap layered CUDA array is allocated if all three extents are non-zero, + * and both, ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set. + * \p Width must be equal to \p Height, and \p Depth must be a multiple of six. + * A cubemap layered CUDA array is a special type of 2D layered CUDA array that + * consists of a collection of cubemaps. The first six layers represent the first + * cubemap, the next six layers form the second cubemap, and so on. + * + * - ::Format specifies the format of the elements; ::CUarray_format is + * defined as: + * \code + typedef enum CUarray_format_enum { + CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, + CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, + CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, + CU_AD_FORMAT_SIGNED_INT8 = 0x08, + CU_AD_FORMAT_SIGNED_INT16 = 0x09, + CU_AD_FORMAT_SIGNED_INT32 = 0x0a, + CU_AD_FORMAT_HALF = 0x10, + CU_AD_FORMAT_FLOAT = 0x20 + } CUarray_format; + * \endcode + * + * - \p NumChannels specifies the number of packed components per CUDA array + * element; it may be 1, 2, or 4; + * + * - ::Flags may be set to + * - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA arrays. If this flag is set, + * \p Depth specifies the number of layers, not the depth of a 3D array. + * - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to the CUDA array. + * If this flag is not set, ::cuSurfRefSetArray will fail when attempting to bind the CUDA array + * to a surface reference. + * - ::CUDA_ARRAY3D_CUBEMAP to enable creation of cubemaps. If this flag is set, \p Width must be + * equal to \p Height, and \p Depth must be six. If the ::CUDA_ARRAY3D_LAYERED flag is also set, + * then \p Depth must be a multiple of six. + * - ::CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA array will be used for texture gather. + * Texture gather can only be performed on 2D CUDA arrays. + * + * \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table. + * All values are specified in elements. Note that for brevity's sake, the full name of the device attribute + * is not specified. For ex., TEXTURE1D_WIDTH refers to the device attribute + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH. + * + * Note that 2D CUDA arrays have different size requirements if the ::CUDA_ARRAY3D_TEXTURE_GATHER flag + * is set. \p Width and \p Height must not be greater than ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH + * and ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT respectively, in that case. + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
CUDA array typeValid extents that must always be met
{(width range in elements), (height range), + * (depth range)}
Valid extents with CUDA_ARRAY3D_SURFACE_LDST set
+ * {(width range in elements), (height range), (depth range)}
1D{ (1,TEXTURE1D_WIDTH), 0, 0 }{ (1,SURFACE1D_WIDTH), 0, 0 }
2D{ (1,TEXTURE2D_WIDTH), (1,TEXTURE2D_HEIGHT), 0 }{ (1,SURFACE2D_WIDTH), (1,SURFACE2D_HEIGHT), 0 }
3D{ (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) } + *
OR
{ (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE), + * (1,TEXTURE3D_DEPTH_ALTERNATE) }
{ (1,SURFACE3D_WIDTH), (1,SURFACE3D_HEIGHT), + * (1,SURFACE3D_DEPTH) }
1D Layered{ (1,TEXTURE1D_LAYERED_WIDTH), 0, + * (1,TEXTURE1D_LAYERED_LAYERS) }{ (1,SURFACE1D_LAYERED_WIDTH), 0, + * (1,SURFACE1D_LAYERED_LAYERS) }
2D Layered{ (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT), + * (1,TEXTURE2D_LAYERED_LAYERS) }{ (1,SURFACE2D_LAYERED_WIDTH), (1,SURFACE2D_LAYERED_HEIGHT), + * (1,SURFACE2D_LAYERED_LAYERS) }
Cubemap{ (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 }{ (1,SURFACECUBEMAP_WIDTH), + * (1,SURFACECUBEMAP_WIDTH), 6 }
Cubemap Layered{ (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH), + * (1,TEXTURECUBEMAP_LAYERED_LAYERS) }{ (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_WIDTH), + * (1,SURFACECUBEMAP_LAYERED_LAYERS) }
+ * + * Here are examples of CUDA array descriptions: + * + * Description for a CUDA array of 2048 floats: + * \code + CUDA_ARRAY3D_DESCRIPTOR desc; + desc.Format = CU_AD_FORMAT_FLOAT; + desc.NumChannels = 1; + desc.Width = 2048; + desc.Height = 0; + desc.Depth = 0; + * \endcode + * + * Description for a 64 x 64 CUDA array of floats: + * \code + CUDA_ARRAY3D_DESCRIPTOR desc; + desc.Format = CU_AD_FORMAT_FLOAT; + desc.NumChannels = 1; + desc.Width = 64; + desc.Height = 64; + desc.Depth = 0; + * \endcode + * + * Description for a \p width x \p height x \p depth CUDA array of 64-bit, + * 4x16-bit float16's: + * \code + CUDA_ARRAY3D_DESCRIPTOR desc; + desc.Format = CU_AD_FORMAT_HALF; + desc.NumChannels = 4; + desc.Width = width; + desc.Height = height; + desc.Depth = depth; + * \endcode + * + * \param pHandle - Returned array + * \param pAllocateArray - 3D array descriptor + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMalloc3DArray + */ +CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray); + +/** + * \brief Get a 3D CUDA array descriptor + * + * Returns in \p *pArrayDescriptor a descriptor containing information on the + * format and dimensions of the CUDA array \p hArray. It is useful for + * subroutines that have been passed a CUDA array, but need to know the CUDA + * array parameters for validation or other purposes. + * + * This function may be called on 1D and 2D arrays, in which case the \p Height + * and/or \p Depth members of the descriptor struct will be set to 0. + * + * \param pArrayDescriptor - Returned 3D array descriptor + * \param hArray - 3D array to get descriptor of + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_CONTEXT_IS_DESTROYED + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaArrayGetInfo + */ +CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray); + +/** + * \brief Creates a CUDA mipmapped array + * + * Creates a CUDA mipmapped array according to the ::CUDA_ARRAY3D_DESCRIPTOR structure + * \p pMipmappedArrayDesc and returns a handle to the new CUDA mipmapped array in \p *pHandle. + * \p numMipmapLevels specifies the number of mipmap levels to be allocated. This value is + * clamped to the range [1, 1 + floor(log2(max(width, height, depth)))]. + * + * The ::CUDA_ARRAY3D_DESCRIPTOR is defined as: + * + * \code + typedef struct { + unsigned int Width; + unsigned int Height; + unsigned int Depth; + CUarray_format Format; + unsigned int NumChannels; + unsigned int Flags; + } CUDA_ARRAY3D_DESCRIPTOR; + * \endcode + * where: + * + * - \p Width, \p Height, and \p Depth are the width, height, and depth of the + * CUDA array (in elements); the following types of CUDA arrays can be allocated: + * - A 1D mipmapped array is allocated if \p Height and \p Depth extents are both zero. + * - A 2D mipmapped array is allocated if only \p Depth extent is zero. + * - A 3D mipmapped array is allocated if all three extents are non-zero. + * - A 1D layered CUDA mipmapped array is allocated if only \p Height is zero and the + * ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number + * of layers is determined by the depth extent. + * - A 2D layered CUDA mipmapped array is allocated if all three extents are non-zero and + * the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number + * of layers is determined by the depth extent. + * - A cubemap CUDA mipmapped array is allocated if all three extents are non-zero and the + * ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and + * \p Depth must be six. A cubemap is a special type of 2D layered CUDA array, + * where the six layers represent the six faces of a cube. The order of the six + * layers in memory is the same as that listed in ::CUarray_cubemap_face. + * - A cubemap layered CUDA mipmapped array is allocated if all three extents are non-zero, + * and both, ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set. + * \p Width must be equal to \p Height, and \p Depth must be a multiple of six. + * A cubemap layered CUDA array is a special type of 2D layered CUDA array that + * consists of a collection of cubemaps. The first six layers represent the first + * cubemap, the next six layers form the second cubemap, and so on. + * + * - ::Format specifies the format of the elements; ::CUarray_format is + * defined as: + * \code + typedef enum CUarray_format_enum { + CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, + CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, + CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, + CU_AD_FORMAT_SIGNED_INT8 = 0x08, + CU_AD_FORMAT_SIGNED_INT16 = 0x09, + CU_AD_FORMAT_SIGNED_INT32 = 0x0a, + CU_AD_FORMAT_HALF = 0x10, + CU_AD_FORMAT_FLOAT = 0x20 + } CUarray_format; + * \endcode + * + * - \p NumChannels specifies the number of packed components per CUDA array + * element; it may be 1, 2, or 4; + * + * - ::Flags may be set to + * - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA mipmapped arrays. If this flag is set, + * \p Depth specifies the number of layers, not the depth of a 3D array. + * - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to individual mipmap levels of + * the CUDA mipmapped array. If this flag is not set, ::cuSurfRefSetArray will fail when attempting to + * bind a mipmap level of the CUDA mipmapped array to a surface reference. + * - ::CUDA_ARRAY3D_CUBEMAP to enable creation of mipmapped cubemaps. If this flag is set, \p Width must be + * equal to \p Height, and \p Depth must be six. If the ::CUDA_ARRAY3D_LAYERED flag is also set, + * then \p Depth must be a multiple of six. + * - ::CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA mipmapped array will be used for texture gather. + * Texture gather can only be performed on 2D CUDA mipmapped arrays. + * + * \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table. + * All values are specified in elements. Note that for brevity's sake, the full name of the device attribute + * is not specified. For ex., TEXTURE1D_MIPMAPPED_WIDTH refers to the device attribute + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH. + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
CUDA array typeValid extents that must always be met
{(width range in elements), (height range), + * (depth range)}
Valid extents with CUDA_ARRAY3D_SURFACE_LDST set
+ * {(width range in elements), (height range), (depth range)}
1D{ (1,TEXTURE1D_MIPMAPPED_WIDTH), 0, 0 }{ (1,SURFACE1D_WIDTH), 0, 0 }
2D{ (1,TEXTURE2D_MIPMAPPED_WIDTH), (1,TEXTURE2D_MIPMAPPED_HEIGHT), 0 }{ (1,SURFACE2D_WIDTH), (1,SURFACE2D_HEIGHT), 0 }
3D{ (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) } + *
OR
{ (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE), + * (1,TEXTURE3D_DEPTH_ALTERNATE) }
{ (1,SURFACE3D_WIDTH), (1,SURFACE3D_HEIGHT), + * (1,SURFACE3D_DEPTH) }
1D Layered{ (1,TEXTURE1D_LAYERED_WIDTH), 0, + * (1,TEXTURE1D_LAYERED_LAYERS) }{ (1,SURFACE1D_LAYERED_WIDTH), 0, + * (1,SURFACE1D_LAYERED_LAYERS) }
2D Layered{ (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT), + * (1,TEXTURE2D_LAYERED_LAYERS) }{ (1,SURFACE2D_LAYERED_WIDTH), (1,SURFACE2D_LAYERED_HEIGHT), + * (1,SURFACE2D_LAYERED_LAYERS) }
Cubemap{ (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 }{ (1,SURFACECUBEMAP_WIDTH), + * (1,SURFACECUBEMAP_WIDTH), 6 }
Cubemap Layered{ (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH), + * (1,TEXTURECUBEMAP_LAYERED_LAYERS) }{ (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_WIDTH), + * (1,SURFACECUBEMAP_LAYERED_LAYERS) }
+ * + * + * \param pHandle - Returned mipmapped array + * \param pMipmappedArrayDesc - mipmapped array descriptor + * \param numMipmapLevels - Number of mipmap levels + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cuMipmappedArrayDestroy, + * ::cuMipmappedArrayGetLevel, + * ::cuArrayCreate, + * ::cudaMallocMipmappedArray + */ +CUresult CUDAAPI cuMipmappedArrayCreate(CUmipmappedArray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc, unsigned int numMipmapLevels); + +/** + * \brief Gets a mipmap level of a CUDA mipmapped array + * + * Returns in \p *pLevelArray a CUDA array that represents a single mipmap level + * of the CUDA mipmapped array \p hMipmappedArray. + * + * If \p level is greater than the maximum number of levels in this mipmapped array, + * ::CUDA_ERROR_INVALID_VALUE is returned. + * + * \param pLevelArray - Returned mipmap level CUDA array + * \param hMipmappedArray - CUDA mipmapped array + * \param level - Mipmap level + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa + * ::cuMipmappedArrayCreate, + * ::cuMipmappedArrayDestroy, + * ::cuArrayCreate, + * ::cudaGetMipmappedArrayLevel + */ +CUresult CUDAAPI cuMipmappedArrayGetLevel(CUarray *pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level); + +/** + * \brief Destroys a CUDA mipmapped array + * + * Destroys the CUDA mipmapped array \p hMipmappedArray. + * + * \param hMipmappedArray - Mipmapped array to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_ARRAY_IS_MAPPED, + * ::CUDA_ERROR_CONTEXT_IS_DESTROYED + * \notefnerr + * + * \sa + * ::cuMipmappedArrayCreate, + * ::cuMipmappedArrayGetLevel, + * ::cuArrayCreate, + * ::cudaFreeMipmappedArray + */ +CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray); + +/** +* \brief Retrieve handle for an address range +* +* Get a handle of the specified type to an address range. The address range +* must have been obtained by a prior call to either ::cuMemAlloc or ::cuMemAddressReserve. +* If the address range was obtained via ::cuMemAddressReserve, it must also be fully mapped via ::cuMemMap. +* +* Users must ensure the \p dptr and \p size are aligned to the host page size. +* +* When requesting CUmemRangeHandleType::CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, +* users are expected to query for dma_buf support for the platform +* by using ::CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED device attribute before calling +* this API. The \p handle will be interpreted as a pointer to an integer to store the dma_buf file descriptor. +* Users must ensure the entire address range is backed and mapped when +* the address range is allocated by ::cuMemAddressReserve. All the physical +* allocations backing the address range must be resident on the same device and +* have identical allocation properties. Users are also expected to retrieve a +* new handle every time the underlying physical allocation(s) corresponding +* to a previously queried VA range are changed. +* +* \param[out] handle - Pointer to the location where the returned handle will be stored. +* \param[in] dptr - Pointer to a valid CUDA device allocation. Must be aligned to host page size. +* \param[in] size - Length of the address range. Must be aligned to host page size. +* \param[in] handleType - Type of handle requested (defines type and size of the \p handle output parameter) +* \param[in] flags - Reserved, must be zero +* +* \return +* CUDA_SUCCESS +* CUDA_ERROR_INVALID_VALUE +* CUDA_ERROR_NOT_SUPPORTED +*/ +CUresult CUDAAPI cuMemGetHandleForAddressRange(void *handle, CUdeviceptr dptr, size_t size, CUmemRangeHandleType handleType, unsigned long long flags); + +/** @} */ /* END CUDA_MEM */ + +/** + * \defgroup CUDA_VA Virtual Memory Management + * + * ___MANBRIEF___ virtual memory management functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the virtual memory management functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** +* \brief Allocate an address range reservation. +* +* Reserves a virtual address range based on the given parameters, giving +* the starting address of the range in \p ptr. This API requires a system that +* supports UVA. The size and address parameters must be a multiple of the +* host page size and the alignment must be a power of two or zero for default +* alignment. +* +* \param[out] ptr - Resulting pointer to start of virtual address range allocated +* \param[in] size - Size of the reserved virtual address range requested +* \param[in] alignment - Alignment of the reserved virtual address range requested +* \param[in] addr - Fixed starting address range requested +* \param[in] flags - Currently unused, must be zero +* \return +* ::CUDA_SUCCESS, +* ::CUDA_ERROR_INVALID_VALUE, +* ::CUDA_ERROR_OUT_OF_MEMORY, +* ::CUDA_ERROR_NOT_INITIALIZED, +* ::CUDA_ERROR_DEINITIALIZED, +* ::CUDA_ERROR_NOT_PERMITTED, +* ::CUDA_ERROR_NOT_SUPPORTED +* +* \sa ::cuMemAddressFree +*/ +CUresult CUDAAPI cuMemAddressReserve(CUdeviceptr *ptr, size_t size, size_t alignment, CUdeviceptr addr, unsigned long long flags); + +/** +* \brief Free an address range reservation. +* +* Frees a virtual address range reserved by cuMemAddressReserve. The size +* must match what was given to memAddressReserve and the ptr given must +* match what was returned from memAddressReserve. +* +* \param[in] ptr - Starting address of the virtual address range to free +* \param[in] size - Size of the virtual address region to free +* \return +* ::CUDA_SUCCESS, +* ::CUDA_ERROR_INVALID_VALUE, +* ::CUDA_ERROR_NOT_INITIALIZED, +* ::CUDA_ERROR_DEINITIALIZED, +* ::CUDA_ERROR_NOT_PERMITTED, +* ::CUDA_ERROR_NOT_SUPPORTED +* +* \sa ::cuMemAddressReserve +*/ +CUresult CUDAAPI cuMemAddressFree(CUdeviceptr ptr, size_t size); + +/** +* \brief Create a CUDA memory handle representing a memory allocation of a given size described by the given properties +* +* This creates a memory allocation on the target device specified through the +* \p prop structure. The created allocation will not have any device or host +* mappings. The generic memory \p handle for the allocation can be +* mapped to the address space of calling process via ::cuMemMap. This handle +* cannot be transmitted directly to other processes (see +* ::cuMemExportToShareableHandle). On Windows, the caller must also pass +* an LPSECURITYATTRIBUTE in \p prop to be associated with this handle which +* limits or allows access to this handle for a recipient process (see +* ::CUmemAllocationProp::win32HandleMetaData for more). The \p size of this +* allocation must be a multiple of the the value given via +* ::cuMemGetAllocationGranularity with the ::CU_MEM_ALLOC_GRANULARITY_MINIMUM +* flag. +* If ::CUmemAllocationProp::allocFlags::usage contains ::CU_MEM_CREATE_USAGE_TILE_POOL flag then +* the memory allocation is intended only to be used as backing tile pool for sparse CUDA arrays +* and sparse CUDA mipmapped arrays. +* (see ::cuMemMapArrayAsync). +* +* \param[out] handle - Value of handle returned. All operations on this allocation are to be performed using this handle. +* \param[in] size - Size of the allocation requested +* \param[in] prop - Properties of the allocation to create. +* \param[in] flags - flags for future use, must be zero now. +* \return +* ::CUDA_SUCCESS, +* ::CUDA_ERROR_INVALID_VALUE, +* ::CUDA_ERROR_OUT_OF_MEMORY, +* ::CUDA_ERROR_INVALID_DEVICE, +* ::CUDA_ERROR_NOT_INITIALIZED, +* ::CUDA_ERROR_DEINITIALIZED, +* ::CUDA_ERROR_NOT_PERMITTED, +* ::CUDA_ERROR_NOT_SUPPORTED +* \notefnerr +* +* \sa ::cuMemRelease, ::cuMemExportToShareableHandle, ::cuMemImportFromShareableHandle +*/ +CUresult CUDAAPI cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, const CUmemAllocationProp *prop, unsigned long long flags); + +/** +* \brief Release a memory handle representing a memory allocation which was previously allocated through cuMemCreate. +* +* Frees the memory that was allocated on a device through cuMemCreate. +* +* The memory allocation will be freed when all outstanding mappings to the memory +* are unmapped and when all outstanding references to the handle (including it's +* shareable counterparts) are also released. The generic memory handle can be +* freed when there are still outstanding mappings made with this handle. Each +* time a recipient process imports a shareable handle, it needs to pair it with +* ::cuMemRelease for the handle to be freed. If \p handle is not a valid handle +* the behavior is undefined. +* +* \param[in] handle Value of handle which was returned previously by cuMemCreate. +* \return +* ::CUDA_SUCCESS, +* ::CUDA_ERROR_INVALID_VALUE, +* ::CUDA_ERROR_NOT_INITIALIZED, +* ::CUDA_ERROR_DEINITIALIZED, +* ::CUDA_ERROR_NOT_PERMITTED, +* ::CUDA_ERROR_NOT_SUPPORTED +* \notefnerr +* +* \sa ::cuMemCreate +*/ +CUresult CUDAAPI cuMemRelease(CUmemGenericAllocationHandle handle); + +/** +* \brief Maps an allocation handle to a reserved virtual address range. +* +* Maps bytes of memory represented by \p handle starting from byte \p offset to +* \p size to address range [\p addr, \p addr + \p size]. This range must be an +* address reservation previously reserved with ::cuMemAddressReserve, and +* \p offset + \p size must be less than the size of the memory allocation. +* Both \p ptr, \p size, and \p offset must be a multiple of the value given via +* ::cuMemGetAllocationGranularity with the ::CU_MEM_ALLOC_GRANULARITY_MINIMUM flag. +* If \p handle represents a multicast object, \p ptr, \p size and \p offset must +* be aligned to the value returned by ::cuMulticastGetGranularity with the flag +* ::CU_MULTICAST_MINIMUM_GRANULARITY. For best performance however, it is +* recommended that \p ptr, \p size and \p offset be aligned to the value +* returned by ::cuMulticastGetGranularity with the flag +* ::CU_MULTICAST_RECOMMENDED_GRANULARITY. +* +* Please note calling ::cuMemMap does not make the address accessible, +* the caller needs to update accessibility of a contiguous mapped VA +* range by calling ::cuMemSetAccess. +* +* Once a recipient process obtains a shareable memory handle +* from ::cuMemImportFromShareableHandle, the process must +* use ::cuMemMap to map the memory into its address ranges before +* setting accessibility with ::cuMemSetAccess. +* +* ::cuMemMap can only create mappings on VA range reservations +* that are not currently mapped. +* +* \param[in] ptr - Address where memory will be mapped. +* \param[in] size - Size of the memory mapping. +* \param[in] offset - Offset into the memory represented by +* - \p handle from which to start mapping +* - Note: currently must be zero. +* \param[in] handle - Handle to a shareable memory +* \param[in] flags - flags for future use, must be zero now. +* \return +* ::CUDA_SUCCESS, +* ::CUDA_ERROR_INVALID_VALUE, +* ::CUDA_ERROR_INVALID_DEVICE, +* ::CUDA_ERROR_OUT_OF_MEMORY, +* ::CUDA_ERROR_NOT_INITIALIZED, +* ::CUDA_ERROR_DEINITIALIZED, +* ::CUDA_ERROR_NOT_PERMITTED, +* ::CUDA_ERROR_NOT_SUPPORTED +* \notefnerr +* +* \sa ::cuMemUnmap, ::cuMemSetAccess, ::cuMemCreate, ::cuMemAddressReserve, ::cuMemImportFromShareableHandle +*/ +CUresult CUDAAPI cuMemMap(CUdeviceptr ptr, size_t size, size_t offset, CUmemGenericAllocationHandle handle, unsigned long long flags); + +/** + * \brief Maps or unmaps subregions of sparse CUDA arrays and sparse CUDA mipmapped arrays + * + * Performs map or unmap operations on subregions of sparse CUDA arrays and sparse CUDA mipmapped arrays. + * Each operation is specified by a ::CUarrayMapInfo entry in the \p mapInfoList array of size \p count. + * The structure ::CUarrayMapInfo is defined as follow: + \code + typedef struct CUarrayMapInfo_st { + CUresourcetype resourceType; + union { + CUmipmappedArray mipmap; + CUarray array; + } resource; + + CUarraySparseSubresourceType subresourceType; + union { + struct { + unsigned int level; + unsigned int layer; + unsigned int offsetX; + unsigned int offsetY; + unsigned int offsetZ; + unsigned int extentWidth; + unsigned int extentHeight; + unsigned int extentDepth; + } sparseLevel; + struct { + unsigned int layer; + unsigned long long offset; + unsigned long long size; + } miptail; + } subresource; + + CUmemOperationType memOperationType; + + CUmemHandleType memHandleType; + union { + CUmemGenericAllocationHandle memHandle; + } memHandle; + + unsigned long long offset; + unsigned int deviceBitMask; + unsigned int flags; + unsigned int reserved[2]; + } CUarrayMapInfo; + \endcode + * + * where ::CUarrayMapInfo::resourceType specifies the type of resource to be operated on. + * If ::CUarrayMapInfo::resourceType is set to ::CUresourcetype::CU_RESOURCE_TYPE_ARRAY then + * ::CUarrayMapInfo::resource::array must be set to a valid sparse CUDA array handle. + * The CUDA array must be either a 2D, 2D layered or 3D CUDA array and must have been allocated using + * ::cuArrayCreate or ::cuArray3DCreate with the flag ::CUDA_ARRAY3D_SPARSE + * or ::CUDA_ARRAY3D_DEFERRED_MAPPING. + * For CUDA arrays obtained using ::cuMipmappedArrayGetLevel, ::CUDA_ERROR_INVALID_VALUE will be returned. + * If ::CUarrayMapInfo::resourceType is set to ::CUresourcetype::CU_RESOURCE_TYPE_MIPMAPPED_ARRAY + * then ::CUarrayMapInfo::resource::mipmap must be set to a valid sparse CUDA mipmapped array handle. + * The CUDA mipmapped array must be either a 2D, 2D layered or 3D CUDA mipmapped array and must have been + * allocated using ::cuMipmappedArrayCreate with the flag ::CUDA_ARRAY3D_SPARSE + * or ::CUDA_ARRAY3D_DEFERRED_MAPPING. + * + * ::CUarrayMapInfo::subresourceType specifies the type of subresource within the resource. + * ::CUarraySparseSubresourceType_enum is defined as: + \code + typedef enum CUarraySparseSubresourceType_enum { + CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL = 0, + CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL = 1 + } CUarraySparseSubresourceType; + \endcode + * + * where ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL indicates a + * sparse-miplevel which spans at least one tile in every dimension. The remaining miplevels which + * are too small to span at least one tile in any dimension constitute the mip tail region as indicated by + * ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL subresource type. + * + * If ::CUarrayMapInfo::subresourceType is set to ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL + * then ::CUarrayMapInfo::subresource::sparseLevel struct must contain valid array subregion offsets and extents. + * The ::CUarrayMapInfo::subresource::sparseLevel::offsetX, ::CUarrayMapInfo::subresource::sparseLevel::offsetY + * and ::CUarrayMapInfo::subresource::sparseLevel::offsetZ must specify valid X, Y and Z offsets respectively. + * The ::CUarrayMapInfo::subresource::sparseLevel::extentWidth, ::CUarrayMapInfo::subresource::sparseLevel::extentHeight + * and ::CUarrayMapInfo::subresource::sparseLevel::extentDepth must specify valid width, height and depth extents respectively. + * These offsets and extents must be aligned to the corresponding tile dimension. + * For CUDA mipmapped arrays ::CUarrayMapInfo::subresource::sparseLevel::level must specify a valid mip level index. Otherwise, + * must be zero. + * For layered CUDA arrays and layered CUDA mipmapped arrays ::CUarrayMapInfo::subresource::sparseLevel::layer must specify a valid layer index. Otherwise, + * must be zero. + * ::CUarrayMapInfo::subresource::sparseLevel::offsetZ must be zero and ::CUarrayMapInfo::subresource::sparseLevel::extentDepth + * must be set to 1 for 2D and 2D layered CUDA arrays and CUDA mipmapped arrays. + * Tile extents can be obtained by calling ::cuArrayGetSparseProperties and ::cuMipmappedArrayGetSparseProperties + * + * If ::CUarrayMapInfo::subresourceType is set to ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL + * then ::CUarrayMapInfo::subresource::miptail struct must contain valid mip tail offset in + * ::CUarrayMapInfo::subresource::miptail::offset and size in ::CUarrayMapInfo::subresource::miptail::size. + * Both, mip tail offset and mip tail size must be aligned to the tile size. + * For layered CUDA mipmapped arrays which don't have the flag ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL set in ::CUDA_ARRAY_SPARSE_PROPERTIES::flags + * as returned by ::cuMipmappedArrayGetSparseProperties, ::CUarrayMapInfo::subresource::miptail::layer must specify a valid layer index. + * Otherwise, must be zero. + * + * If ::CUarrayMapInfo::resource::array or ::CUarrayMapInfo::resource::mipmap was created with ::CUDA_ARRAY3D_DEFERRED_MAPPING + * flag set the ::CUarrayMapInfo::subresourceType and the contents of ::CUarrayMapInfo::subresource will be ignored. + * + * ::CUarrayMapInfo::memOperationType specifies the type of operation. ::CUmemOperationType is defined as: + \code + typedef enum CUmemOperationType_enum { + CU_MEM_OPERATION_TYPE_MAP = 1, + CU_MEM_OPERATION_TYPE_UNMAP = 2 + } CUmemOperationType; + \endcode + * If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_MAP then the subresource + * will be mapped onto the tile pool memory specified by ::CUarrayMapInfo::memHandle at offset ::CUarrayMapInfo::offset. + * The tile pool allocation has to be created by specifying the ::CU_MEM_CREATE_USAGE_TILE_POOL flag when calling ::cuMemCreate. Also, + * ::CUarrayMapInfo::memHandleType must be set to ::CUmemHandleType::CU_MEM_HANDLE_TYPE_GENERIC. + * + * If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_UNMAP then an unmapping operation + * is performed. ::CUarrayMapInfo::memHandle must be NULL. + * + * ::CUarrayMapInfo::deviceBitMask specifies the list of devices that must map or unmap physical memory. + * Currently, this mask must have exactly one bit set, and the corresponding device must match the device associated with the stream. + * If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_MAP, the device must also match + * the device associated with the tile pool memory allocation as specified by ::CUarrayMapInfo::memHandle. + * + * ::CUarrayMapInfo::flags and ::CUarrayMapInfo::reserved[] are unused and must be set to zero. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * + * \param[in] mapInfoList - List of ::CUarrayMapInfo + * \param[in] count - Count of ::CUarrayMapInfo in \p mapInfoList + * \param[in] hStream - Stream identifier for the stream to use for map or unmap operations + * + * \sa ::cuMipmappedArrayCreate, ::cuArrayCreate, ::cuArray3DCreate, ::cuMemCreate, ::cuArrayGetSparseProperties, ::cuMipmappedArrayGetSparseProperties + */ +CUresult CUDAAPI cuMemMapArrayAsync(CUarrayMapInfo *mapInfoList, unsigned int count, CUstream hStream); + +/** +* \brief Unmap the backing memory of a given address range. +* +* The range must be the entire contiguous address range that was mapped to. In +* other words, ::cuMemUnmap cannot unmap a sub-range of an address range mapped +* by ::cuMemCreate / ::cuMemMap. Any backing memory allocations will be freed +* if there are no existing mappings and there are no unreleased memory handles. +* +* When ::cuMemUnmap returns successfully the address range is converted to an +* address reservation and can be used for a future calls to ::cuMemMap. Any new +* mapping to this virtual address will need to have access granted through +* ::cuMemSetAccess, as all mappings start with no accessibility setup. +* +* \param[in] ptr - Starting address for the virtual address range to unmap +* \param[in] size - Size of the virtual address range to unmap +* \returns +* ::CUDA_SUCCESS, +* ::CUDA_ERROR_INVALID_VALUE, +* ::CUDA_ERROR_NOT_INITIALIZED, +* ::CUDA_ERROR_DEINITIALIZED, +* ::CUDA_ERROR_NOT_PERMITTED, +* ::CUDA_ERROR_NOT_SUPPORTED +* \notefnerr +* \note_sync +* +* \sa ::cuMemCreate, ::cuMemAddressReserve +*/ +CUresult CUDAAPI cuMemUnmap(CUdeviceptr ptr, size_t size); + +/** +* \brief Set the access flags for each location specified in \p desc for the given virtual address range +* +* Given the virtual address range via \p ptr and \p size, and the locations +* in the array given by \p desc and \p count, set the access flags for the +* target locations. The range must be a fully mapped address range +* containing all allocations created by ::cuMemMap / ::cuMemCreate. +* When setting the access flags for a virtual address range mapping a multicast +* object, \p ptr and \p size must be aligned to the value returned by +* ::cuMulticastGetGranularity with the flag ::CU_MULTICAST_MINIMUM_GRANULARITY. +* For best performance however, it is recommended that \p ptr and \p size be +* aligned to the value returned by ::cuMulticastGetGranularity with the flag +* ::CU_MULTICAST_RECOMMENDED_GRANULARITY. +* +* \param[in] ptr - Starting address for the virtual address range +* \param[in] size - Length of the virtual address range +* \param[in] desc - Array of ::CUmemAccessDesc that describe how to change the +* - mapping for each location specified +* \param[in] count - Number of ::CUmemAccessDesc in \p desc +* \returns +* ::CUDA_SUCCESS, +* ::CUDA_ERROR_INVALID_VALUE, +* ::CUDA_ERROR_INVALID_DEVICE, +* ::CUDA_ERROR_NOT_SUPPORTED +* \notefnerr +* \note_sync +* +* \sa ::cuMemSetAccess, ::cuMemCreate, :cuMemMap +*/ +CUresult CUDAAPI cuMemSetAccess(CUdeviceptr ptr, size_t size, const CUmemAccessDesc *desc, size_t count); + +/** +* \brief Get the access \p flags set for the given \p location and \p ptr +* +* \param[out] flags - Flags set for this location +* \param[in] location - Location in which to check the flags for +* \param[in] ptr - Address in which to check the access flags for +* \returns +* ::CUDA_SUCCESS, +* ::CUDA_ERROR_INVALID_VALUE, +* ::CUDA_ERROR_INVALID_DEVICE, +* ::CUDA_ERROR_NOT_INITIALIZED, +* ::CUDA_ERROR_DEINITIALIZED, +* ::CUDA_ERROR_NOT_PERMITTED, +* ::CUDA_ERROR_NOT_SUPPORTED +* +* \sa ::cuMemSetAccess +*/ +CUresult CUDAAPI cuMemGetAccess(unsigned long long *flags, const CUmemLocation *location, CUdeviceptr ptr); + +/** +* \brief Exports an allocation to a requested shareable handle type +* +* Given a CUDA memory handle, create a shareable memory +* allocation handle that can be used to share the memory with other +* processes. The recipient process can convert the shareable handle back into a +* CUDA memory handle using ::cuMemImportFromShareableHandle and map +* it with ::cuMemMap. The implementation of what this handle is and how it +* can be transferred is defined by the requested handle type in \p handleType +* +* Once all shareable handles are closed and the allocation is released, the allocated +* memory referenced will be released back to the OS and uses of the CUDA handle afterward +* will lead to undefined behavior. +* +* This API can also be used in conjunction with other APIs (e.g. Vulkan, OpenGL) +* that support importing memory from the shareable type +* +* \param[out] shareableHandle - Pointer to the location in which to store the requested handle type +* \param[in] handle - CUDA handle for the memory allocation +* \param[in] handleType - Type of shareable handle requested (defines type and size of the \p shareableHandle output parameter) +* \param[in] flags - Reserved, must be zero +* \returns +* ::CUDA_SUCCESS, +* ::CUDA_ERROR_INVALID_VALUE, +* ::CUDA_ERROR_NOT_INITIALIZED, +* ::CUDA_ERROR_DEINITIALIZED, +* ::CUDA_ERROR_NOT_PERMITTED, +* ::CUDA_ERROR_NOT_SUPPORTED +* +* \sa ::cuMemImportFromShareableHandle +*/ +CUresult CUDAAPI cuMemExportToShareableHandle(void *shareableHandle, CUmemGenericAllocationHandle handle, CUmemAllocationHandleType handleType, unsigned long long flags); + +/** +* \brief Imports an allocation from a requested shareable handle type. +* +* If the current process cannot support the memory described by this shareable +* handle, this API will error as CUDA_ERROR_NOT_SUPPORTED. +* +* \note Importing shareable handles exported from some graphics APIs(VUlkan, OpenGL, etc) +* created on devices under an SLI group may not be supported, and thus this API will +* return CUDA_ERROR_NOT_SUPPORTED. +* There is no guarantee that the contents of \p handle will be the same CUDA memory handle +* for the same given OS shareable handle, or the same underlying allocation. +* +* \param[out] handle - CUDA Memory handle for the memory allocation. +* \param[in] osHandle - Shareable Handle representing the memory allocation that is to be imported. +* \param[in] shHandleType - handle type of the exported handle ::CUmemAllocationHandleType. +* \returns +* ::CUDA_SUCCESS, +* ::CUDA_ERROR_INVALID_VALUE, +* ::CUDA_ERROR_NOT_INITIALIZED, +* ::CUDA_ERROR_DEINITIALIZED, +* ::CUDA_ERROR_NOT_PERMITTED, +* ::CUDA_ERROR_NOT_SUPPORTED +* +* \sa ::cuMemExportToShareableHandle, ::cuMemMap, ::cuMemRelease +*/ +CUresult CUDAAPI cuMemImportFromShareableHandle(CUmemGenericAllocationHandle *handle, void *osHandle, CUmemAllocationHandleType shHandleType); + +/** +* \brief Calculates either the minimal or recommended granularity +* +* Calculates either the minimal or recommended granularity +* for a given allocation specification and returns it in granularity. This +* granularity can be used as a multiple for alignment, size, or address mapping. +* +* \param[out] granularity Returned granularity. +* \param[in] prop Property for which to determine the granularity for +* \param[in] option Determines which granularity to return +* \returns +* ::CUDA_SUCCESS, +* ::CUDA_ERROR_INVALID_VALUE, +* ::CUDA_ERROR_NOT_INITIALIZED, +* ::CUDA_ERROR_DEINITIALIZED, +* ::CUDA_ERROR_NOT_PERMITTED, +* ::CUDA_ERROR_NOT_SUPPORTED +* +* \sa ::cuMemCreate, ::cuMemMap +*/ +CUresult CUDAAPI cuMemGetAllocationGranularity(size_t *granularity, const CUmemAllocationProp *prop, CUmemAllocationGranularity_flags option); + +/** +* \brief Retrieve the contents of the property structure defining properties for this handle +* +* \param[out] prop - Pointer to a properties structure which will hold the information about this handle +* \param[in] handle - Handle which to perform the query on +* \returns +* ::CUDA_SUCCESS, +* ::CUDA_ERROR_INVALID_VALUE, +* ::CUDA_ERROR_NOT_INITIALIZED, +* ::CUDA_ERROR_DEINITIALIZED, +* ::CUDA_ERROR_NOT_PERMITTED, +* ::CUDA_ERROR_NOT_SUPPORTED +* +* \sa ::cuMemCreate, ::cuMemImportFromShareableHandle +*/ +CUresult CUDAAPI cuMemGetAllocationPropertiesFromHandle(CUmemAllocationProp *prop, CUmemGenericAllocationHandle handle); + +/** +* \brief Given an address \p addr, returns the allocation handle of the backing memory allocation. +* +* The handle is guaranteed to be the same handle value used to map the memory. If the address +* requested is not mapped, the function will fail. The returned handle must be released with +* corresponding number of calls to ::cuMemRelease. +* +* \note The address \p addr, can be any address in a range previously mapped +* by ::cuMemMap, and not necessarily the start address. +* +* \param[out] handle CUDA Memory handle for the backing memory allocation. +* \param[in] addr Memory address to query, that has been mapped previously. +* \returns +* ::CUDA_SUCCESS, +* ::CUDA_ERROR_INVALID_VALUE, +* ::CUDA_ERROR_NOT_INITIALIZED, +* ::CUDA_ERROR_DEINITIALIZED, +* ::CUDA_ERROR_NOT_PERMITTED, +* ::CUDA_ERROR_NOT_SUPPORTED +* +* \sa ::cuMemCreate, ::cuMemRelease, ::cuMemMap +*/ +CUresult CUDAAPI cuMemRetainAllocationHandle(CUmemGenericAllocationHandle *handle, void *addr); + +/** @} */ /* END CUDA_VA */ + +/** + * \defgroup CUDA_MALLOC_ASYNC Stream Ordered Memory Allocator + * + * ___MANBRIEF___ Functions for performing allocation and free operations in stream order. + * Functions for controlling the behavior of the underlying allocator. + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the stream ordered memory allocator exposed by the + * low-level CUDA driver application programming interface. + * + * @{ + * + * \section CUDA_MALLOC_ASYNC_overview overview + * + * The asynchronous allocator allows the user to allocate and free in stream order. + * All asynchronous accesses of the allocation must happen between + * the stream executions of the allocation and the free. If the memory is accessed + * outside of the promised stream order, a use before allocation / use after free error + * will cause undefined behavior. + * + * The allocator is free to reallocate the memory as long as it can guarantee + * that compliant memory accesses will not overlap temporally. + * The allocator may refer to internal stream ordering as well as inter-stream dependencies + * (such as CUDA events and null stream dependencies) when establishing the temporal guarantee. + * The allocator may also insert inter-stream dependencies to establish the temporal guarantee. + * + * \section CUDA_MALLOC_ASYNC_support Supported Platforms + * + * Whether or not a device supports the integrated stream ordered memory allocator + * may be queried by calling ::cuDeviceGetAttribute() with the device attribute + * ::CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED + */ + +/** + * \brief Frees memory with stream ordered semantics + * + * Inserts a free operation into \p hStream. + * The allocation must not be accessed after stream execution reaches the free. + * After this API returns, accessing the memory from any subsequent work launched on the GPU + * or querying its pointer attributes results in undefined behavior. + * + * \note During stream capture, this function results in the creation of a free node and + * must therefore be passed the address of a graph allocation. + * + * \param dptr - memory to free + * \param hStream - The stream establishing the stream ordering contract. + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context), + * ::CUDA_ERROR_NOT_SUPPORTED + */ +CUresult CUDAAPI cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream); + +/** + * \brief Allocates memory with stream ordered semantics + * + * Inserts an allocation operation into \p hStream. + * A pointer to the allocated memory is returned immediately in *dptr. + * The allocation must not be accessed until the the allocation operation completes. + * The allocation comes from the memory pool current to the stream's device. + * + * \note The default memory pool of a device contains device memory from that device. + * \note Basic stream ordering allows future work submitted into the same stream to use the allocation. + * Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation + * operation completes before work submitted in a separate stream runs. + * \note During stream capture, this function results in the creation of an allocation node. In this case, + * the allocation is owned by the graph instead of the memory pool. The memory pool's properties + * are used to set the node's creation parameters. + * + * \param[out] dptr - Returned device pointer + * \param[in] bytesize - Number of bytes to allocate + * \param[in] hStream - The stream establishing the stream ordering contract and the memory pool to allocate from + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context), + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_OUT_OF_MEMORY + * + * \sa ::cuMemAllocFromPoolAsync, ::cuMemFreeAsync, ::cuDeviceSetMemPool, + * ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate, + * ::cuMemPoolSetAccess, ::cuMemPoolSetAttribute + */ +CUresult CUDAAPI cuMemAllocAsync(CUdeviceptr *dptr, size_t bytesize, CUstream hStream); + +/** + * \brief Tries to release memory back to the OS + * + * Releases memory back to the OS until the pool contains fewer than minBytesToKeep + * reserved bytes, or there is no more memory that the allocator can safely release. + * The allocator cannot release OS allocations that back outstanding asynchronous allocations. + * The OS allocations may happen at different granularity from the user allocations. + * + * \note: Allocations that have not been freed count as outstanding. + * \note: Allocations that have been asynchronously freed but whose completion has + * not been observed on the host (eg. by a synchronize) can count as outstanding. + * + * \param[in] pool - The memory pool to trim + * \param[in] minBytesToKeep - If the pool has less than minBytesToKeep reserved, + * the TrimTo operation is a no-op. Otherwise the pool will be guaranteed to have + * at least minBytesToKeep bytes reserved after the operation. + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, + * ::cuDeviceGetMemPool, ::cuMemPoolCreate + */ +CUresult CUDAAPI cuMemPoolTrimTo(CUmemoryPool pool, size_t minBytesToKeep); + +/** + * \brief Sets attributes of a memory pool + * + * Supported attributes are: + * - ::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: (value type = cuuint64_t) + * Amount of reserved memory in bytes to hold onto before trying + * to release memory back to the OS. When more than the release + * threshold bytes of memory are held by the memory pool, the + * allocator will try to release memory back to the OS on the + * next call to stream, event or context synchronize. (default 0) + * - ::CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: (value type = int) + * Allow ::cuMemAllocAsync to use memory asynchronously freed + * in another stream as long as a stream ordering dependency + * of the allocating stream on the free action exists. + * Cuda events and null stream interactions can create the required + * stream ordered dependencies. (default enabled) + * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: (value type = int) + * Allow reuse of already completed frees when there is no dependency + * between the free and allocation. (default enabled) + * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: (value type = int) + * Allow ::cuMemAllocAsync to insert new stream dependencies + * in order to establish the stream ordering required to reuse + * a piece of memory released by ::cuMemFreeAsync (default enabled). + * - ::CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH: (value type = cuuint64_t) + * Reset the high watermark that tracks the amount of backing memory that was + * allocated for the memory pool. It is illegal to set this attribute to a non-zero value. + * - ::CU_MEMPOOL_ATTR_USED_MEM_HIGH: (value type = cuuint64_t) + * Reset the high watermark that tracks the amount of used memory that was + * allocated for the memory pool. + * + * \param[in] pool - The memory pool to modify + * \param[in] attr - The attribute to modify + * \param[in] value - Pointer to the value to assign + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, + * ::cuDeviceGetMemPool, ::cuMemPoolCreate + */ +CUresult CUDAAPI cuMemPoolSetAttribute(CUmemoryPool pool, CUmemPool_attribute attr, void *value); + +/** + * \brief Gets attributes of a memory pool + * + * Supported attributes are: + * - ::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: (value type = cuuint64_t) + * Amount of reserved memory in bytes to hold onto before trying + * to release memory back to the OS. When more than the release + * threshold bytes of memory are held by the memory pool, the + * allocator will try to release memory back to the OS on the + * next call to stream, event or context synchronize. (default 0) + * - ::CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: (value type = int) + * Allow ::cuMemAllocAsync to use memory asynchronously freed + * in another stream as long as a stream ordering dependency + * of the allocating stream on the free action exists. + * Cuda events and null stream interactions can create the required + * stream ordered dependencies. (default enabled) + * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: (value type = int) + * Allow reuse of already completed frees when there is no dependency + * between the free and allocation. (default enabled) + * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: (value type = int) + * Allow ::cuMemAllocAsync to insert new stream dependencies + * in order to establish the stream ordering required to reuse + * a piece of memory released by ::cuMemFreeAsync (default enabled). + * - ::CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT: (value type = cuuint64_t) + * Amount of backing memory currently allocated for the mempool + * - ::CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH: (value type = cuuint64_t) + * High watermark of backing memory allocated for the mempool since the + * last time it was reset. + * - ::CU_MEMPOOL_ATTR_USED_MEM_CURRENT: (value type = cuuint64_t) + * Amount of memory from the pool that is currently in use by the application. + * - ::CU_MEMPOOL_ATTR_USED_MEM_HIGH: (value type = cuuint64_t) + * High watermark of the amount of memory from the pool that was in use by the application. + * + * \param[in] pool - The memory pool to get attributes of + * \param[in] attr - The attribute to get + * \param[out] value - Retrieved value + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, + * ::cuDeviceGetMemPool, ::cuMemPoolCreate + */ +CUresult CUDAAPI cuMemPoolGetAttribute(CUmemoryPool pool, CUmemPool_attribute attr, void *value); + +/** + * \brief Controls visibility of pools between devices + * + * \param[in] pool - The pool being modified + * \param[in] map - Array of access descriptors. Each descriptor instructs the access to enable for a single gpu. + * \param[in] count - Number of descriptors in the map array. + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, + * ::cuDeviceGetMemPool, ::cuMemPoolCreate + */ +CUresult CUDAAPI cuMemPoolSetAccess(CUmemoryPool pool, const CUmemAccessDesc *map, size_t count); + +/** + * \brief Returns the accessibility of a pool from a device + * + * Returns the accessibility of the pool's memory from the specified location. + * + * \param[out] flags - the accessibility of the pool from the specified location + * \param[in] memPool - the pool being queried + * \param[in] location - the location accessing the pool + * + * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, + * ::cuDeviceGetMemPool, ::cuMemPoolCreate + */ +CUresult CUDAAPI cuMemPoolGetAccess(CUmemAccess_flags *flags, CUmemoryPool memPool, CUmemLocation *location); + +/** + * \brief Creates a memory pool + * + * Creates a CUDA memory pool and returns the handle in \p pool. The \p poolProps determines + * the properties of the pool such as the backing device and IPC capabilities. + * + * By default, the pool's memory will be accessible from the device it is allocated on. + * + * \note Specifying CU_MEM_HANDLE_TYPE_NONE creates a memory pool that will not support IPC. + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_NOT_SUPPORTED + * + * \sa ::cuDeviceSetMemPool, ::cuDeviceGetMemPool, ::cuDeviceGetDefaultMemPool, + * ::cuMemAllocFromPoolAsync, ::cuMemPoolExportToShareableHandle + */ +CUresult CUDAAPI cuMemPoolCreate(CUmemoryPool *pool, const CUmemPoolProps *poolProps); + +/** + * \brief Destroys the specified memory pool + * + * If any pointers obtained from this pool haven't been freed or + * the pool has free operations that haven't completed + * when ::cuMemPoolDestroy is invoked, the function will return immediately and the + * resources associated with the pool will be released automatically + * once there are no more outstanding allocations. + * + * Destroying the current mempool of a device sets the default mempool of + * that device as the current mempool for that device. + * + * \note A device's default memory pool cannot be destroyed. + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuMemFreeAsync, ::cuDeviceSetMemPool, ::cuDeviceGetMemPool, + * ::cuDeviceGetDefaultMemPool, ::cuMemPoolCreate + */ +CUresult CUDAAPI cuMemPoolDestroy(CUmemoryPool pool); + +/** + * \brief Allocates memory from a specified pool with stream ordered semantics. + * + * Inserts an allocation operation into \p hStream. + * A pointer to the allocated memory is returned immediately in *dptr. + * The allocation must not be accessed until the the allocation operation completes. + * The allocation comes from the specified memory pool. + * + * \note + * - The specified memory pool may be from a device different than that of the specified \p hStream. + * + * - Basic stream ordering allows future work submitted into the same stream to use the allocation. + * Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation + * operation completes before work submitted in a separate stream runs. + * + * \note During stream capture, this function results in the creation of an allocation node. In this case, + * the allocation is owned by the graph instead of the memory pool. The memory pool's properties + * are used to set the node's creation parameters. + * + * \param[out] dptr - Returned device pointer + * \param[in] bytesize - Number of bytes to allocate + * \param[in] pool - The pool to allocate from + * \param[in] hStream - The stream establishing the stream ordering semantic + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context), + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_OUT_OF_MEMORY + * + * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, + * ::cuDeviceGetMemPool, ::cuMemPoolCreate, ::cuMemPoolSetAccess, + * ::cuMemPoolSetAttribute + */ +CUresult CUDAAPI cuMemAllocFromPoolAsync(CUdeviceptr *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream); + +/** + * \brief Exports a memory pool to the requested handle type. + * + * Given an IPC capable mempool, create an OS handle to share the pool with another process. + * A recipient process can convert the shareable handle into a mempool with ::cuMemPoolImportFromShareableHandle. + * Individual pointers can then be shared with the ::cuMemPoolExportPointer and ::cuMemPoolImportPointer APIs. + * The implementation of what the shareable handle is and how it can be transferred is defined by the requested + * handle type. + * + * \note: To create an IPC capable mempool, create a mempool with a CUmemAllocationHandleType other than CU_MEM_HANDLE_TYPE_NONE. + * + * \param[out] handle_out - Returned OS handle + * \param[in] pool - pool to export + * \param[in] handleType - the type of handle to create + * \param[in] flags - must be 0 + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_OUT_OF_MEMORY + * + * \sa ::cuMemPoolImportFromShareableHandle, ::cuMemPoolExportPointer, + * ::cuMemPoolImportPointer, ::cuMemAllocAsync, ::cuMemFreeAsync, + * ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate, + * ::cuMemPoolSetAccess, ::cuMemPoolSetAttribute + */ +CUresult CUDAAPI cuMemPoolExportToShareableHandle(void *handle_out, CUmemoryPool pool, CUmemAllocationHandleType handleType, unsigned long long flags); + +/** + * \brief imports a memory pool from a shared handle. + * + * Specific allocations can be imported from the imported pool with cuMemPoolImportPointer. + * + * \note Imported memory pools do not support creating new allocations. + * As such imported memory pools may not be used in cuDeviceSetMemPool + * or ::cuMemAllocFromPoolAsync calls. + * + * \param[out] pool_out - Returned memory pool + * \param[in] handle - OS handle of the pool to open + * \param[in] handleType - The type of handle being imported + * \param[in] flags - must be 0 + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_OUT_OF_MEMORY + * + * \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolExportPointer, ::cuMemPoolImportPointer + */ +CUresult CUDAAPI cuMemPoolImportFromShareableHandle( + CUmemoryPool *pool_out, + void *handle, + CUmemAllocationHandleType handleType, + unsigned long long flags); + +/** + * \brief Export data to share a memory pool allocation between processes. + * + * Constructs \p shareData_out for sharing a specific allocation from an already shared memory pool. + * The recipient process can import the allocation with the ::cuMemPoolImportPointer api. + * The data is not a handle and may be shared through any IPC mechanism. + * + * \param[out] shareData_out - Returned export data + * \param[in] ptr - pointer to memory being exported + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_OUT_OF_MEMORY + * + * \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolImportFromShareableHandle, ::cuMemPoolImportPointer + */ +CUresult CUDAAPI cuMemPoolExportPointer(CUmemPoolPtrExportData *shareData_out, CUdeviceptr ptr); + +/** + * \brief Import a memory pool allocation from another process. + * + * Returns in \p ptr_out a pointer to the imported memory. + * The imported memory must not be accessed before the allocation operation completes + * in the exporting process. The imported memory must be freed from all importing processes before + * being freed in the exporting process. The pointer may be freed with cuMemFree + * or cuMemFreeAsync. If cuMemFreeAsync is used, the free must be completed + * on the importing process before the free operation on the exporting process. + * + * \note The cuMemFreeAsync api may be used in the exporting process before + * the cuMemFreeAsync operation completes in its stream as long as the + * cuMemFreeAsync in the exporting process specifies a stream with + * a stream dependency on the importing process's cuMemFreeAsync. + * + * \param[out] ptr_out - pointer to imported memory + * \param[in] pool - pool from which to import + * \param[in] shareData - data specifying the memory to import + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_OUT_OF_MEMORY + * + * \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolImportFromShareableHandle, ::cuMemPoolExportPointer + */ +CUresult CUDAAPI cuMemPoolImportPointer(CUdeviceptr *ptr_out, CUmemoryPool pool, CUmemPoolPtrExportData *shareData); + +/** @} */ /* END CUDA_MALLOC_ASYNC */ + +/** + * \defgroup CUDA_MULTICAST Multicast Object Management + * + * ___MANBRIEF___ Functions for creating multicast objects, adding devices to them and binding/unbinding memory + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the CUDA multicast object operations exposed by the + * low-level CUDA driver application programming interface. + * + * @{ + * + * \section CUDA_MULTICAST_overview overview + * + * A multicast object created via ::cuMulticastCreate enables certain memory + * operations to be broadcasted to a team of devices. Devices can be added to a + * multicast object via ::cuMulticastAddDevice. Memory can be bound on each + * participating device via either ::cuMulticastBindMem or ::cuMulticastBindAddr. + * Multicast objects can be mapped into a device's virtual address space using + * the virtual memmory management APIs (see ::cuMemMap and ::cuMemSetAccess). + * + * \section CUDA_MULTICAST_support Supported Platforms + * + * Support for multicast on a specific device can be queried using the device + * attribute ::CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED + */ + +/** + * \brief Create a generic allocation handle representing a multicast object described by the given properties. + * + * This creates a multicast object as described by \p prop. The number of + * participating devices is specified by ::CUmulticastObjectProp::numDevices. + * Devices can be added to the multicast object via ::cuMulticastAddDevice. + * All participating devices must be added to the multicast object before memory + * can be bound to it. Memory is bound to the multicast object via either + * ::cuMulticastBindMem or ::cuMulticastBindAddr, and can be unbound via + * ::cuMulticastUnbind. The total amount of memory that can be bound per device + * is specified by :CUmulticastObjectProp::size. This size must be a multiple of + * the value returned by ::cuMulticastGetGranularity with the flag + * ::CU_MULTICAST_GRANULARITY_MINIMUM. For best performance however, the size + * should be aligned to the value returned by ::cuMulticastGetGranularity with + * the flag ::CU_MULTICAST_GRANULARITY_RECOMMENDED. + * + * After all participating devices have been added, multicast objects can also + * be mapped to a device's virtual address space using the virtual memory + * management APIs (see ::cuMemMap and ::cuMemSetAccess). Multicast objects can + * also be shared with other processes by requesting a shareable handle via + * ::cuMemExportToShareableHandle. Note that the desired types of shareable + * handles must be specified in the bitmask ::CUmulticastObjectProp::handleTypes. + * Multicast objects can be released using the virtual memory management API + * ::cuMemRelease. + * + * \param[out] mcHandle Value of handle returned. + * \param[in] prop Properties of the multicast object to create. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_PERMITTED, + * ::CUDA_ERROR_NOT_SUPPORTED + * + * \sa ::cuMulticastAddDevice, ::cuMulticastBindMem, ::cuMulticastBindAddr, ::cuMulticastUnbind + * \sa ::cuMemCreate, ::cuMemRelease, ::cuMemExportToShareableHandle, ::cuMemImportFromShareableHandle + */ +CUresult CUDAAPI cuMulticastCreate(CUmemGenericAllocationHandle *mcHandle, const CUmulticastObjectProp *prop); + +/** + * \brief Associate a device to a multicast object. + * + * Associates a device to a multicast object. The added device will be a part of + * the multicast team of size specified by CUmulticastObjectProp::numDevices + * during ::cuMulticastCreate. + * The association of the device to the multicast object is permanent during + * the life time of the multicast object. + * All devices must be added to the multicast team before any memory can be + * bound to any device in the team. Any calls to ::cuMulticastBindMem or + * ::cuMulticastBindAddr will block until all devices have been added. + * Similarly all devices must be added to the multicast team before a virtual + * address range can be mapped to the multicast object. A call to ::cuMemMap + * will block until all devices have been added. + * + * \param[in] mcHandle Handle representing a multicast object. + * \param[in] dev Device that will be associated to the multicast + * object. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_PERMITTED, + * ::CUDA_ERROR_NOT_SUPPORTED + * + * \sa ::cuMulticastCreate, ::cuMulticastBindMem, ::cuMulticastBindAddr + */ +CUresult CUDAAPI cuMulticastAddDevice(CUmemGenericAllocationHandle mcHandle, CUdevice dev); + +/** + * \brief Bind a memory allocation represented by a handle to a multicast object. + * + * Binds a memory allocation specified by \p memHandle and created via + * ::cuMemCreate to a multicast object represented by \p mcHandle and created + * via ::cuMulticastCreate. The intended \p size of the bind, the offset in the + * multicast range \p mcOffset as well as the offset in the memory \p memOffset + * must be a multiple of the value returned by ::cuMulticastGetGranularity with + * the flag ::CU_MULTICAST_GRANULARITY_MINIMUM. For best performance however, + * \p size, \p mcOffset and \p memOffset should be aligned to the granularity of + * the memory allocation(see ::cuMemGetAllocationGranularity) or to the value + * returned by ::cuMulticastGetGranularity with the flag + * ::CU_MULTICAST_GRANULARITY_RECOMMENDED. + * + * The \p size + \p memOffset must be smaller than the size of the allocated + * memory. Similarly the \p size + \p mcOffset must be smaller than the size + * of the multicast object. + * The memory allocation must have beeen created on one of the devices + * that was added to the multicast team via ::cuMulticastAddDevice. + * Externally shareable as well as imported multicast objects can be bound only + * to externally shareable memory. + * Note that this call will return CUDA_ERROR_OUT_OF_MEMORY if there are + * insufficient resources required to perform the bind. This call may also + * return CUDA_ERROR_SYSTEM_NOT_READY if the necessary system software is not + * initialized or running. + * + * \param[in] mcHandle Handle representing a multicast object. + * \param[in] mcOffset Offset into the multicast object for attachment. + * \param[in] memHandle Handle representing a memory allocation. + * \param[in] memOffset Offset into the memory for attachment. + * \param[in] size Size of the memory that will be bound to the + * multicast object. + * \param[in] flags Flags for future use, must be zero for now. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_PERMITTED, + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_SYSTEM_NOT_READY + * + * \sa ::cuMulticastCreate, ::cuMulticastAddDevice, ::cuMemCreate + */ +CUresult CUDAAPI cuMulticastBindMem(CUmemGenericAllocationHandle mcHandle, size_t mcOffset, CUmemGenericAllocationHandle memHandle, size_t memOffset, size_t size, unsigned long long flags); + +/** + * \brief Bind a memory allocation represented by a virtual address to a multicast object. + * + * Binds a memory allocation specified by its mapped address \p memptr to a + * multicast object represented by \p mcHandle. + * The memory must have been allocated via ::cuMemCreate or ::cudaMallocAsync. + * The intended \p size of the bind, the offset in the multicast range + * \p mcOffset and \p memptr must be a multiple of the value returned by + * ::cuMulticastGetGranularity with the flag ::CU_MULTICAST_GRANULARITY_MINIMUM. + * For best performance however, \p size, \p mcOffset and \p memptr should be + * aligned to the value returned by ::cuMulticastGetGranularity with the flag + * ::CU_MULTICAST_GRANULARITY_RECOMMENDED. + * + * The \p size must be smaller than the size of the allocated memory. + * Similarly the \p size + \p mcOffset must be smaller than the total size + * of the multicast object. + * The memory allocation must have beeen created on one of the devices + * that was added to the multicast team via ::cuMulticastAddDevice. + * Externally shareable as well as imported multicast objects can be bound only + * to externally shareable memory. + * Note that this call will return CUDA_ERROR_OUT_OF_MEMORY if there are + * insufficient resources required to perform the bind. This call may also + * return CUDA_ERROR_SYSTEM_NOT_READY if the necessary system software is not + * initialized or running. + * + * \param[in] mcHandle Handle representing a multicast object. + * \param[in] mcOffset Offset into multicast va range for attachment. + * \param[in] memptr Virtual address of the memory allocation. + * \param[in] size Size of memory that will be bound to the + * multicast object. + * \param[in] flags Flags for future use, must be zero now. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_PERMITTED, + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_SYSTEM_NOT_READY + * + * \sa ::cuMulticastCreate, ::cuMulticastAddDevice, ::cuMemCreate + */ +CUresult CUDAAPI cuMulticastBindAddr(CUmemGenericAllocationHandle mcHandle, size_t mcOffset, CUdeviceptr memptr, size_t size, unsigned long long flags); + +/** + * \brief Unbind any memory allocations bound to a multicast object at a given offset and upto a given size. + * + * Unbinds any memory allocations hosted on \p dev and bound to a multicast + * object at \p mcOffset and upto a given \p size. + * The intended \p size of the unbind and the offset in the multicast range + * ( \p mcOffset ) must be a multiple of the value returned by + * ::cuMulticastGetGranularity flag ::CU_MULTICAST_GRANULARITY_MINIMUM. + * The \p size + \p mcOffset must be smaller than the total size of the + * multicast object. + * + * \note + * Warning: + * The \p mcOffset and the \p size must match the corresponding values specified + * during the bind call. Any other values may result in undefined behavior. + * + * \param[in] mcHandle Handle representing a multicast object. + * \param[in] dev Device that hosts the memory allocation. + * \param[in] mcOffset Offset into the multicast object. + * \param[in] size Desired size to unbind. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_PERMITTED, + * ::CUDA_ERROR_NOT_SUPPORTED + * + * \sa ::cuMulticastBindMem, ::cuMulticastBindAddr + */ +CUresult CUDAAPI cuMulticastUnbind(CUmemGenericAllocationHandle mcHandle, CUdevice dev, size_t mcOffset, size_t size); + +/** +* \brief Calculates either the minimal or recommended granularity for multicast object +* +* Calculates either the minimal or recommended granularity for a given set of +* multicast object properties and returns it in granularity. This granularity +* can be used as a multiple for size, bind offsets and address mappings of the +* multicast object. +* +* \param[out] granularity Returned granularity. +* \param[in] prop Properties of the multicast object. +* \param[in] option Determines which granularity to return. +* +* \returns +* ::CUDA_SUCCESS, +* ::CUDA_ERROR_INVALID_VALUE, +* ::CUDA_ERROR_NOT_INITIALIZED, +* ::CUDA_ERROR_DEINITIALIZED, +* ::CUDA_ERROR_NOT_PERMITTED, +* ::CUDA_ERROR_NOT_SUPPORTED +* +* \sa ::cuMulticastCreate, ::cuMulticastBindMem, ::cuMulticastBindAddr, ::cuMulticastUnbind +*/ +CUresult CUDAAPI cuMulticastGetGranularity(size_t *granularity, const CUmulticastObjectProp *prop, CUmulticastGranularity_flags option); + +/** @} */ /* END CUDA_MULTICAST */ + +/** + * \defgroup CUDA_UNIFIED Unified Addressing + * + * ___MANBRIEF___ unified addressing functions of the low-level CUDA driver + * API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the unified addressing functions of the + * low-level CUDA driver application programming interface. + * + * @{ + * + * \section CUDA_UNIFIED_overview Overview + * + * CUDA devices can share a unified address space with the host. + * For these devices there is no distinction between a device + * pointer and a host pointer -- the same pointer value may be + * used to access memory from the host program and from a kernel + * running on the device (with exceptions enumerated below). + * + * \section CUDA_UNIFIED_support Supported Platforms + * + * Whether or not a device supports unified addressing may be + * queried by calling ::cuDeviceGetAttribute() with the device + * attribute ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. + * + * Unified addressing is automatically enabled in 64-bit processes + * + * \section CUDA_UNIFIED_lookup Looking Up Information from Pointer Values + * + * It is possible to look up information about the memory which backs a + * pointer value. For instance, one may want to know if a pointer points + * to host or device memory. As another example, in the case of device + * memory, one may want to know on which CUDA device the memory + * resides. These properties may be queried using the function + * ::cuPointerGetAttribute() + * + * Since pointers are unique, it is not necessary to specify information + * about the pointers specified to the various copy functions in the + * CUDA API. The function ::cuMemcpy() may be used to perform a copy + * between two pointers, ignoring whether they point to host or device + * memory (making ::cuMemcpyHtoD(), ::cuMemcpyDtoD(), and ::cuMemcpyDtoH() + * unnecessary for devices supporting unified addressing). For + * multidimensional copies, the memory type ::CU_MEMORYTYPE_UNIFIED may be + * used to specify that the CUDA driver should infer the location of the + * pointer from its value. + * + * \section CUDA_UNIFIED_automaphost Automatic Mapping of Host Allocated Host Memory + * + * All host memory allocated in all contexts using ::cuMemAllocHost() and + * ::cuMemHostAlloc() is always directly accessible from all contexts on + * all devices that support unified addressing. This is the case regardless + * of whether or not the flags ::CU_MEMHOSTALLOC_PORTABLE and + * ::CU_MEMHOSTALLOC_DEVICEMAP are specified. + * + * The pointer value through which allocated host memory may be accessed + * in kernels on all devices that support unified addressing is the same + * as the pointer value through which that memory is accessed on the host, + * so it is not necessary to call ::cuMemHostGetDevicePointer() to get the device + * pointer for these allocations. + * + * Note that this is not the case for memory allocated using the flag + * ::CU_MEMHOSTALLOC_WRITECOMBINED, as discussed below. + * + * \section CUDA_UNIFIED_autopeerregister Automatic Registration of Peer Memory + * + * Upon enabling direct access from a context that supports unified addressing + * to another peer context that supports unified addressing using + * ::cuCtxEnablePeerAccess() all memory allocated in the peer context using + * ::cuMemAlloc() and ::cuMemAllocPitch() will immediately be accessible + * by the current context. The device pointer value through + * which any peer memory may be accessed in the current context + * is the same pointer value through which that memory may be + * accessed in the peer context. + * + * \section CUDA_UNIFIED_exceptions Exceptions, Disjoint Addressing + * + * Not all memory may be accessed on devices through the same pointer + * value through which they are accessed on the host. These exceptions + * are host memory registered using ::cuMemHostRegister() and host memory + * allocated using the flag ::CU_MEMHOSTALLOC_WRITECOMBINED. For these + * exceptions, there exists a distinct host and device address for the + * memory. The device address is guaranteed to not overlap any valid host + * pointer range and is guaranteed to have the same value across all + * contexts that support unified addressing. + * + * This device address may be queried using ::cuMemHostGetDevicePointer() + * when a context using unified addressing is current. Either the host + * or the unified device pointer value may be used to refer to this memory + * through ::cuMemcpy() and similar functions using the + * ::CU_MEMORYTYPE_UNIFIED memory type. + * + */ + +/** + * \brief Returns information about a pointer + * + * The supported attributes are: + * + * - ::CU_POINTER_ATTRIBUTE_CONTEXT: + * + * Returns in \p *data the ::CUcontext in which \p ptr was allocated or + * registered. + * The type of \p data must be ::CUcontext *. + * + * If \p ptr was not allocated by, mapped by, or registered with + * a ::CUcontext which uses unified virtual addressing then + * ::CUDA_ERROR_INVALID_VALUE is returned. + * + * - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE: + * + * Returns in \p *data the physical memory type of the memory that + * \p ptr addresses as a ::CUmemorytype enumerated value. + * The type of \p data must be unsigned int. + * + * If \p ptr addresses device memory then \p *data is set to + * ::CU_MEMORYTYPE_DEVICE. The particular ::CUdevice on which the + * memory resides is the ::CUdevice of the ::CUcontext returned by the + * ::CU_POINTER_ATTRIBUTE_CONTEXT attribute of \p ptr. + * + * If \p ptr addresses host memory then \p *data is set to + * ::CU_MEMORYTYPE_HOST. + * + * If \p ptr was not allocated by, mapped by, or registered with + * a ::CUcontext which uses unified virtual addressing then + * ::CUDA_ERROR_INVALID_VALUE is returned. + * + * If the current ::CUcontext does not support unified virtual + * addressing then ::CUDA_ERROR_INVALID_CONTEXT is returned. + * + * - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER: + * + * Returns in \p *data the device pointer value through which + * \p ptr may be accessed by kernels running in the current + * ::CUcontext. + * The type of \p data must be CUdeviceptr *. + * + * If there exists no device pointer value through which + * kernels running in the current ::CUcontext may access + * \p ptr then ::CUDA_ERROR_INVALID_VALUE is returned. + * + * If there is no current ::CUcontext then + * ::CUDA_ERROR_INVALID_CONTEXT is returned. + * + * Except in the exceptional disjoint addressing cases discussed + * below, the value returned in \p *data will equal the input + * value \p ptr. + * + * - ::CU_POINTER_ATTRIBUTE_HOST_POINTER: + * + * Returns in \p *data the host pointer value through which + * \p ptr may be accessed by by the host program. + * The type of \p data must be void **. + * If there exists no host pointer value through which + * the host program may directly access \p ptr then + * ::CUDA_ERROR_INVALID_VALUE is returned. + * + * Except in the exceptional disjoint addressing cases discussed + * below, the value returned in \p *data will equal the input + * value \p ptr. + * + * - ::CU_POINTER_ATTRIBUTE_P2P_TOKENS: + * + * Returns in \p *data two tokens for use with the nv-p2p.h Linux + * kernel interface. \p data must be a struct of type + * CUDA_POINTER_ATTRIBUTE_P2P_TOKENS. + * + * \p ptr must be a pointer to memory obtained from :cuMemAlloc(). + * Note that p2pToken and vaSpaceToken are only valid for the + * lifetime of the source allocation. A subsequent allocation at + * the same address may return completely different tokens. + * Querying this attribute has a side effect of setting the attribute + * ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS for the region of memory that + * \p ptr points to. + * + * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS: + * + * A boolean attribute which when set, ensures that synchronous memory operations + * initiated on the region of memory that \p ptr points to will always synchronize. + * See further documentation in the section titled "API synchronization behavior" + * to learn more about cases when synchronous memory operations can + * exhibit asynchronous behavior. + * + * - ::CU_POINTER_ATTRIBUTE_BUFFER_ID: + * + * Returns in \p *data a buffer ID which is guaranteed to be unique within the process. + * \p data must point to an unsigned long long. + * + * \p ptr must be a pointer to memory obtained from a CUDA memory allocation API. + * Every memory allocation from any of the CUDA memory allocation APIs will + * have a unique ID over a process lifetime. Subsequent allocations do not reuse IDs + * from previous freed allocations. IDs are only unique within a single process. + * + * + * - ::CU_POINTER_ATTRIBUTE_IS_MANAGED: + * + * Returns in \p *data a boolean that indicates whether the pointer points to + * managed memory or not. + * + * If \p ptr is not a valid CUDA pointer then ::CUDA_ERROR_INVALID_VALUE is returned. + * + * - ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL: + * + * Returns in \p *data an integer representing a device ordinal of a device against + * which the memory was allocated or registered. + * + * - ::CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE: + * + * Returns in \p *data a boolean that indicates if this pointer maps to + * an allocation that is suitable for ::cudaIpcGetMemHandle. + * + * - ::CU_POINTER_ATTRIBUTE_RANGE_START_ADDR: + * + * Returns in \p *data the starting address for the allocation referenced + * by the device pointer \p ptr. Note that this is not necessarily the + * address of the mapped region, but the address of the mappable address + * range \p ptr references (e.g. from ::cuMemAddressReserve). + * + * - ::CU_POINTER_ATTRIBUTE_RANGE_SIZE: + * + * Returns in \p *data the size for the allocation referenced by the device + * pointer \p ptr. Note that this is not necessarily the size of the mapped + * region, but the size of the mappable address range \p ptr references + * (e.g. from ::cuMemAddressReserve). To retrieve the size of the mapped + * region, see ::cuMemGetAddressRange + * + * - ::CU_POINTER_ATTRIBUTE_MAPPED: + * + * Returns in \p *data a boolean that indicates if this pointer is in a + * valid address range that is mapped to a backing allocation. + * + * - ::CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES: + * + * Returns a bitmask of the allowed handle types for an allocation that may + * be passed to ::cuMemExportToShareableHandle. + * + * - ::CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE: + * + * Returns in \p *data the handle to the mempool that the allocation was obtained from. + * + * \par + * + * Note that for most allocations in the unified virtual address space + * the host and device pointer for accessing the allocation will be the + * same. The exceptions to this are + * - user memory registered using ::cuMemHostRegister + * - host memory allocated using ::cuMemHostAlloc with the + * ::CU_MEMHOSTALLOC_WRITECOMBINED flag + * For these types of allocation there will exist separate, disjoint host + * and device addresses for accessing the allocation. In particular + * - The host address will correspond to an invalid unmapped device address + * (which will result in an exception if accessed from the device) + * - The device address will correspond to an invalid unmapped host address + * (which will result in an exception if accessed from the host). + * For these types of allocations, querying ::CU_POINTER_ATTRIBUTE_HOST_POINTER + * and ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER may be used to retrieve the host + * and device addresses from either address. + * + * \param data - Returned pointer attribute value + * \param attribute - Pointer attribute to query + * \param ptr - Pointer + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuPointerSetAttribute, + * ::cuMemAlloc, + * ::cuMemFree, + * ::cuMemAllocHost, + * ::cuMemFreeHost, + * ::cuMemHostAlloc, + * ::cuMemHostRegister, + * ::cuMemHostUnregister, + * ::cudaPointerGetAttributes + */ +CUresult CUDAAPI cuPointerGetAttribute(void *data, CUpointer_attribute attribute, CUdeviceptr ptr); + +/** + * \brief Prefetches memory to the specified destination device + * + * Prefetches memory to the specified destination device. \p devPtr is the + * base device pointer of the memory to be prefetched and \p dstDevice is the + * destination device. \p count specifies the number of bytes to copy. \p hStream + * is the stream in which the operation is enqueued. The memory range must refer + * to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables. + * + * Passing in CU_DEVICE_CPU for \p dstDevice will prefetch the data to host memory. If + * \p dstDevice is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS + * must be non-zero. Additionally, \p hStream must be associated with a device that has a + * non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + * + * The start address and end address of the memory range will be rounded down and rounded up + * respectively to be aligned to CPU page size before the prefetch operation is enqueued + * in the stream. + * + * If no physical memory has been allocated for this region, then this memory region + * will be populated and mapped on the destination device. If there's insufficient + * memory to prefetch the desired region, the Unified Memory driver may evict pages from other + * ::cuMemAllocManaged allocations to host memory in order to make room. Device memory + * allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted. + * + * By default, any mappings to the previous location of the migrated pages are removed and + * mappings for the new location are only setup on \p dstDevice. The exact behavior however + * also depends on the settings applied to this memory range via ::cuMemAdvise as described + * below: + * + * If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range, + * then that subset will create a read-only copy of the pages on \p dstDevice. + * + * If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory + * range, then the pages will be migrated to \p dstDevice even if \p dstDevice is not the + * preferred location of any pages in the memory range. + * + * If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range, + * then mappings to those pages from all the appropriate processors are updated to + * refer to the new location if establishing such a mapping is possible. Otherwise, + * those mappings are cleared. + * + * Note that this API is not required for functionality and only serves to improve performance + * by allowing the application to migrate data to a suitable location before it is accessed. + * Memory accesses to this range are always coherent and are allowed even when the data is + * actively being migrated. + * + * Note that this function is asynchronous with respect to the host and all work + * on other devices. + * + * \param devPtr - Pointer to be prefetched + * \param count - Size in bytes + * \param dstDevice - Destination device to prefetch to + * \param hStream - Stream to enqueue prefetch operation + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuMemcpy, ::cuMemcpyPeer, ::cuMemcpyAsync, + * ::cuMemcpy3DPeerAsync, ::cuMemAdvise, + * ::cudaMemPrefetchAsync + */ +CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream); + +/** + * \brief Advise about the usage of a given memory range + * + * Advise the Unified Memory subsystem about the usage pattern for the memory range + * starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory + * range will be rounded down and rounded up respectively to be aligned to CPU page size before the + * advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged + * or declared via __managed__ variables. The memory range could also refer to system-allocated pageable + * memory provided it represents a valid, host-accessible region of memory and all additional constraints + * imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable + * memory range results in an error being returned. + * + * The \p advice parameter can take the following values: + * - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + * from and only occasionally written to. Any read accesses from any processor to this region will create a + * read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + * is called on this region, it will create a read-only copy of the data on the destination processor. + * If any processor writes to this region, all copies of the corresponding page will be invalidated + * except for the one where the write occurred. The \p device argument is ignored for this advice. + * Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + * that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + * Also, if a context is created on a device that does not have the device attribute + * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + * all such contexts are destroyed. + * If the memory region refers to valid system-allocated pageable memory, then the accessing device must + * have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + * copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + * device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + * will not create a read-only copy when that device accesses this memory region. + * + * - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + * Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated + * copies of the data will be collapsed into a single copy. The location for the collapsed + * copy will be the preferred location if the page has a preferred location and one of the read-duplicated + * copies was resident at that location. Otherwise, the location chosen is arbitrary. + * + * - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + * data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + * preferred location as host memory. If \p device is a GPU, then it must have a non-zero value for the + * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Setting the preferred location + * does not cause data to migrate to that location immediately. Instead, it guides the migration policy + * when a fault occurs on that memory region. If the data is already in its preferred location and the + * faulting processor can establish a mapping without requiring the data to be migrated, then + * data migration will be avoided. On the other hand, if the data is not in its preferred location + * or if a direct mapping cannot be established, then it will be migrated to the processor accessing + * it. It is important to note that setting the preferred location does not prevent data prefetching + * done using ::cuMemPrefetchAsync. + * Having a preferred location can override the page thrash detection and resolution logic in the Unified + * Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device + * memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + * if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + * If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + * policies associated with that advice will override the policies of this advice, unless read accesses from + * \p device will not result in a read-only copy being created on that device as outlined in description for + * the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero + * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has + * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + * then this call has no effect. Note however that this behavior may change in the future. + * + * - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + * and changes the preferred location to none. + * + * - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + * Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. If \p device is a GPU, then + * the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + * This advice does not cause data migration and has no impact on the location of the data per se. Instead, + * it causes the data to always be mapped in the specified processor's page tables, as long as the + * location of the data permits a mapping to be established. If the data gets migrated for any reason, + * the mappings are updated accordingly. + * This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + * Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + * data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data + * over to the other GPUs is not as important because the accesses are infrequent and the overhead of + * migration may be too high. But preventing faults can still help improve performance, and so having + * a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + * to host memory because the CPU typically cannot access device memory directly. Any GPU that had the + * ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + * page in host memory. + * If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + * policies associated with that advice will override the policies of this advice. Additionally, if the + * preferred location of this memory region or any subset of it is also \p device, then the policies + * associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero + * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has + * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + * then this call has no effect. + * + * - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to + * the data from \p device may be removed at any time causing accesses to result in non-fatal page faults. + * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero + * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has + * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + * then this call has no effect. + * + * \param devPtr - Pointer to memory to set the advice for + * \param count - Size in bytes of the memory range + * \param advice - Advice to be applied for the specified memory range + * \param device - Device to apply the advice for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuMemcpy, ::cuMemcpyPeer, ::cuMemcpyAsync, + * ::cuMemcpy3DPeerAsync, ::cuMemPrefetchAsync, + * ::cudaMemAdvise + */ +CUresult CUDAAPI cuMemAdvise(CUdeviceptr devPtr, size_t count, CUmem_advise advice, CUdevice device); + +/** + * \brief Query an attribute of a given memory range + * + * Query an attribute about the memory range starting at \p devPtr with a size of \p count bytes. The + * memory range must refer to managed memory allocated via ::cuMemAllocManaged or declared via + * __managed__ variables. + * + * The \p attribute parameter can take the following values: + * - ::CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY: If this attribute is specified, \p data will be interpreted + * as a 32-bit integer, and \p dataSize must be 4. The result returned will be 1 if all pages in the given + * memory range have read-duplication enabled, or 0 otherwise. + * - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION: If this attribute is specified, \p data will be + * interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be a GPU device + * id if all pages in the memory range have that GPU as their preferred location, or it will be CU_DEVICE_CPU + * if all pages in the memory range have the CPU as their preferred location, or it will be CU_DEVICE_INVALID + * if either all the pages don't have the same preferred location or some of the pages don't have a + * preferred location at all. Note that the actual location of the pages in the memory range at the time of + * the query may be different from the preferred location. + * - ::CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY: If this attribute is specified, \p data will be interpreted + * as an array of 32-bit integers, and \p dataSize must be a non-zero multiple of 4. The result returned + * will be a list of device ids that had ::CU_MEM_ADVISE_SET_ACCESSED_BY set for that entire memory range. + * If any device does not have that advice set for the entire memory range, that device will not be included. + * If \p data is larger than the number of devices that have that advice set for that memory range, + * CU_DEVICE_INVALID will be returned in all the extra space provided. For ex., if \p dataSize is 12 + * (i.e. \p data has 3 elements) and only device 0 has the advice set, then the result returned will be + * { 0, CU_DEVICE_INVALID, CU_DEVICE_INVALID }. If \p data is smaller than the number of devices that have + * that advice set, then only as many devices will be returned as can fit in the array. There is no + * guarantee on which specific devices will be returned, however. + * - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION: If this attribute is specified, \p data will be + * interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be the last location + * to which all pages in the memory range were prefetched explicitly via ::cuMemPrefetchAsync. This will either be + * a GPU id or CU_DEVICE_CPU depending on whether the last location for prefetch was a GPU or the CPU + * respectively. If any page in the memory range was never explicitly prefetched or if all pages were not + * prefetched to the same location, CU_DEVICE_INVALID will be returned. Note that this simply returns the + * last location that the application requested to prefetch the memory range to. It gives no indication as to + * whether the prefetch operation to that location has completed or even begun. + * + * \param data - A pointers to a memory location where the result + * of each attribute query will be written to. + * \param dataSize - Array containing the size of data + * \param attribute - The attribute to query + * \param devPtr - Start of the range to query + * \param count - Size of the range to query + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuMemRangeGetAttributes, ::cuMemPrefetchAsync, + * ::cuMemAdvise, + * ::cudaMemRangeGetAttribute + */ +CUresult CUDAAPI cuMemRangeGetAttribute(void *data, size_t dataSize, CUmem_range_attribute attribute, CUdeviceptr devPtr, size_t count); + +/** + * \brief Query attributes of a given memory range. + * + * Query attributes of the memory range starting at \p devPtr with a size of \p count bytes. The + * memory range must refer to managed memory allocated via ::cuMemAllocManaged or declared via + * __managed__ variables. The \p attributes array will be interpreted to have \p numAttributes + * entries. The \p dataSizes array will also be interpreted to have \p numAttributes entries. + * The results of the query will be stored in \p data. + * + * The list of supported attributes are given below. Please refer to ::cuMemRangeGetAttribute for + * attribute descriptions and restrictions. + * + * - ::CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY + * - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION + * - ::CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY + * - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION + * + * \param data - A two-dimensional array containing pointers to memory + * locations where the result of each attribute query will be written to. + * \param dataSizes - Array containing the sizes of each result + * \param attributes - An array of attributes to query + * (numAttributes and the number of attributes in this array should match) + * \param numAttributes - Number of attributes to query + * \param devPtr - Start of the range to query + * \param count - Size of the range to query + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa ::cuMemRangeGetAttribute, ::cuMemAdvise, + * ::cuMemPrefetchAsync, + * ::cudaMemRangeGetAttributes + */ +CUresult CUDAAPI cuMemRangeGetAttributes(void **data, size_t *dataSizes, CUmem_range_attribute *attributes, size_t numAttributes, CUdeviceptr devPtr, size_t count); + +/** + * \brief Set attributes on a previously allocated memory region + * + * The supported attributes are: + * + * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS: + * + * A boolean attribute that can either be set (1) or unset (0). When set, + * the region of memory that \p ptr points to is guaranteed to always synchronize + * memory operations that are synchronous. If there are some previously initiated + * synchronous memory operations that are pending when this attribute is set, the + * function does not return until those memory operations are complete. + * See further documentation in the section titled "API synchronization behavior" + * to learn more about cases when synchronous memory operations can + * exhibit asynchronous behavior. + * \p value will be considered as a pointer to an unsigned integer to which this attribute is to be set. + * + * \param value - Pointer to memory containing the value to be set + * \param attribute - Pointer attribute to set + * \param ptr - Pointer to a memory region allocated using CUDA memory allocation APIs + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa ::cuPointerGetAttribute, + * ::cuPointerGetAttributes, + * ::cuMemAlloc, + * ::cuMemFree, + * ::cuMemAllocHost, + * ::cuMemFreeHost, + * ::cuMemHostAlloc, + * ::cuMemHostRegister, + * ::cuMemHostUnregister + */ +CUresult CUDAAPI cuPointerSetAttribute(const void *value, CUpointer_attribute attribute, CUdeviceptr ptr); + +/** + * \brief Returns information about a pointer. + * + * The supported attributes are (refer to ::cuPointerGetAttribute for attribute descriptions and restrictions): + * + * - ::CU_POINTER_ATTRIBUTE_CONTEXT + * - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE + * - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER + * - ::CU_POINTER_ATTRIBUTE_HOST_POINTER + * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS + * - ::CU_POINTER_ATTRIBUTE_BUFFER_ID + * - ::CU_POINTER_ATTRIBUTE_IS_MANAGED + * - ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL + * - ::CU_POINTER_ATTRIBUTE_RANGE_START_ADDR + * - ::CU_POINTER_ATTRIBUTE_RANGE_SIZE + * - ::CU_POINTER_ATTRIBUTE_MAPPED + * - ::CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE + * - ::CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES + * - ::CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE + * + * \param numAttributes - Number of attributes to query + * \param attributes - An array of attributes to query + * (numAttributes and the number of attributes in this array should match) + * \param data - A two-dimensional array containing pointers to memory + * locations where the result of each attribute query will be written to. + * \param ptr - Pointer to query + * + * Unlike ::cuPointerGetAttribute, this function will not return an error when the \p ptr + * encountered is not a valid CUDA pointer. Instead, the attributes are assigned default NULL values + * and CUDA_SUCCESS is returned. + * + * If \p ptr was not allocated by, mapped by, or registered with a ::CUcontext which uses UVA + * (Unified Virtual Addressing), ::CUDA_ERROR_INVALID_CONTEXT is returned. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuPointerGetAttribute, + * ::cuPointerSetAttribute, + * ::cudaPointerGetAttributes + */ +CUresult CUDAAPI cuPointerGetAttributes(unsigned int numAttributes, CUpointer_attribute *attributes, void **data, CUdeviceptr ptr); + +/** @} */ /* END CUDA_UNIFIED */ + +/** + * \defgroup CUDA_STREAM Stream Management + * + * ___MANBRIEF___ stream management functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the stream management functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Create a stream + * + * Creates a stream and returns a handle in \p phStream. The \p Flags argument + * determines behaviors of the stream. + * + * Valid values for \p Flags are: + * - ::CU_STREAM_DEFAULT: Default stream creation flag. + * - ::CU_STREAM_NON_BLOCKING: Specifies that work running in the created + * stream may run concurrently with work in stream 0 (the NULL stream), and that + * the created stream should perform no implicit synchronization with stream 0. + * + * \param phStream - Returned newly created stream + * \param Flags - Parameters for stream creation + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuStreamDestroy, + * ::cuStreamCreateWithPriority, + * ::cuStreamGetPriority, + * ::cuStreamGetFlags, + * ::cuStreamWaitEvent, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamAddCallback, + * ::cudaStreamCreate, + * ::cudaStreamCreateWithFlags + */ +CUresult CUDAAPI cuStreamCreate(CUstream *phStream, unsigned int Flags); + +/** + * \brief Create a stream with the given priority + * + * Creates a stream with the specified priority and returns a handle in \p phStream. + * This API alters the scheduler priority of work in the stream. Work in a higher + * priority stream may preempt work already executing in a low priority stream. + * + * \p priority follows a convention where lower numbers represent higher priorities. + * '0' represents default priority. The range of meaningful numerical priorities can + * be queried using ::cuCtxGetStreamPriorityRange. If the specified priority is + * outside the numerical range returned by ::cuCtxGetStreamPriorityRange, + * it will automatically be clamped to the lowest or the highest number in the range. + * + * \param phStream - Returned newly created stream + * \param flags - Flags for stream creation. See ::cuStreamCreate for a list of + * valid flags + * \param priority - Stream priority. Lower numbers represent higher priorities. + * See ::cuCtxGetStreamPriorityRange for more information about + * meaningful stream priorities that can be passed. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \note Stream priorities are supported only on GPUs + * with compute capability 3.5 or higher. + * + * \note In the current implementation, only compute kernels launched in + * priority streams are affected by the stream's priority. Stream priorities have + * no effect on host-to-device and device-to-host memory operations. + * + * \sa ::cuStreamDestroy, + * ::cuStreamCreate, + * ::cuStreamGetPriority, + * ::cuCtxGetStreamPriorityRange, + * ::cuStreamGetFlags, + * ::cuStreamWaitEvent, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamAddCallback, + * ::cudaStreamCreateWithPriority + */ +CUresult CUDAAPI cuStreamCreateWithPriority(CUstream *phStream, unsigned int flags, int priority); + + +/** + * \brief Query the priority of a given stream + * + * Query the priority of a stream created using ::cuStreamCreate or ::cuStreamCreateWithPriority + * and return the priority in \p priority. Note that if the stream was created with a + * priority outside the numerical range returned by ::cuCtxGetStreamPriorityRange, + * this function returns the clamped priority. + * See ::cuStreamCreateWithPriority for details about priority clamping. + * + * \param hStream - Handle to the stream to be queried + * \param priority - Pointer to a signed integer in which the stream's priority is returned + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuStreamDestroy, + * ::cuStreamCreate, + * ::cuStreamCreateWithPriority, + * ::cuCtxGetStreamPriorityRange, + * ::cuStreamGetFlags, + * ::cudaStreamGetPriority + */ +CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority); + +/** + * \brief Query the flags of a given stream + * + * Query the flags of a stream created using ::cuStreamCreate or ::cuStreamCreateWithPriority + * and return the flags in \p flags. + * + * \param hStream - Handle to the stream to be queried + * \param flags - Pointer to an unsigned integer in which the stream's flags are returned + * The value returned in \p flags is a logical 'OR' of all flags that + * were used while creating this stream. See ::cuStreamCreate for the list + * of valid flags + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuStreamDestroy, + * ::cuStreamCreate, + * ::cuStreamGetPriority, + * ::cudaStreamGetFlags + */ +CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags); + +/** + * \brief Returns the unique Id associated with the stream handle supplied + * + * Returns in \p streamId the unique Id which is associated with the given stream handle. + * The Id is unique for the life of the program. + * + * The stream handle \p hStream can refer to any of the following: + *
    + *
  • a stream created via any of the CUDA driver APIs such as ::cuStreamCreate + * and ::cuStreamCreateWithPriority, or their runtime API equivalents such as + * ::cudaStreamCreate, ::cudaStreamCreateWithFlags and ::cudaStreamCreateWithPriority. + * Passing an invalid handle will result in undefined behavior.
  • + *
  • any of the special streams such as the NULL stream, ::CU_STREAM_LEGACY and + * ::CU_STREAM_PER_THREAD. The runtime API equivalents of these are also accepted, + * which are NULL, ::cudaStreamLegacy and ::cudaStreamPerThread respectively.
  • + *
+ * + * \param hStream - Handle to the stream to be queried + * \param streamId - Pointer to store the Id of the stream + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuStreamDestroy, + * ::cuStreamCreate, + * ::cuStreamGetPriority, + * ::cudaStreamGetId + */ +CUresult CUDAAPI cuStreamGetId(CUstream hStream, unsigned long long *streamId); + +/** + * \brief Query the context associated with a stream + * + * Returns the CUDA context that the stream is associated with. + * + * The stream handle \p hStream can refer to any of the following: + *
    + *
  • a stream created via any of the CUDA driver APIs such as ::cuStreamCreate + * and ::cuStreamCreateWithPriority, or their runtime API equivalents such as + * ::cudaStreamCreate, ::cudaStreamCreateWithFlags and ::cudaStreamCreateWithPriority. + * The returned context is the context that was active in the calling thread when the + * stream was created. Passing an invalid handle will result in undefined behavior.
  • + *
  • any of the special streams such as the NULL stream, ::CU_STREAM_LEGACY and + * ::CU_STREAM_PER_THREAD. The runtime API equivalents of these are also accepted, + * which are NULL, ::cudaStreamLegacy and ::cudaStreamPerThread respectively. + * Specifying any of the special handles will return the context current to the + * calling thread. If no context is current to the calling thread, + * ::CUDA_ERROR_INVALID_CONTEXT is returned.
  • + *
+ * + * \param hStream - Handle to the stream to be queried + * \param pctx - Returned context associated with the stream + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * \notefnerr + * + * \sa ::cuStreamDestroy, + * ::cuStreamCreateWithPriority, + * ::cuStreamGetPriority, + * ::cuStreamGetFlags, + * ::cuStreamWaitEvent, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamAddCallback, + * ::cudaStreamCreate, + * ::cudaStreamCreateWithFlags + */ +CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx); + +/** + * \brief Make a compute stream wait on an event + * + * Makes all future work submitted to \p hStream wait for all work captured in + * \p hEvent. See ::cuEventRecord() for details on what is captured by an event. + * The synchronization will be performed efficiently on the device when applicable. + * \p hEvent may be from a different context or device than \p hStream. + * + * flags include: + * - ::CU_EVENT_WAIT_DEFAULT: Default event creation flag. + * - ::CU_EVENT_WAIT_EXTERNAL: Event is captured in the graph as an external + * event node when performing stream capture. This flag is invalid outside + * of stream capture. + * + * \param hStream - Stream to wait + * \param hEvent - Event to wait on (may not be NULL) + * \param Flags - See ::CUevent_capture_flags + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * \note_null_stream + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuEventRecord, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamAddCallback, + * ::cuStreamDestroy, + * ::cudaStreamWaitEvent + */ +CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags); + +/** + * \brief Add a callback to a compute stream + * + * \note This function is slated for eventual deprecation and removal. If + * you do not require the callback to execute in case of a device error, + * consider using ::cuLaunchHostFunc. Additionally, this function is not + * supported with ::cuStreamBeginCapture and ::cuStreamEndCapture, unlike + * ::cuLaunchHostFunc. + * + * Adds a callback to be called on the host after all currently enqueued + * items in the stream have completed. For each + * cuStreamAddCallback call, the callback will be executed exactly once. + * The callback will block later work in the stream until it is finished. + * + * The callback may be passed ::CUDA_SUCCESS or an error code. In the event + * of a device error, all subsequently executed callbacks will receive an + * appropriate ::CUresult. + * + * Callbacks must not make any CUDA API calls. Attempting to use a CUDA API + * will result in ::CUDA_ERROR_NOT_PERMITTED. Callbacks must not perform any + * synchronization that may depend on outstanding device work or other callbacks + * that are not mandated to run earlier. Callbacks without a mandated order + * (in independent streams) execute in undefined order and may be serialized. + * + * For the purposes of Unified Memory, callback execution makes a number of + * guarantees: + *
    + *
  • The callback stream is considered idle for the duration of the + * callback. Thus, for example, a callback may always use memory attached + * to the callback stream.
  • + *
  • The start of execution of a callback has the same effect as + * synchronizing an event recorded in the same stream immediately prior to + * the callback. It thus synchronizes streams which have been "joined" + * prior to the callback.
  • + *
  • Adding device work to any stream does not have the effect of making + * the stream active until all preceding host functions and stream callbacks + * have executed. Thus, for + * example, a callback might use global attached memory even if work has + * been added to another stream, if the work has been ordered behind the + * callback with an event.
  • + *
  • Completion of a callback does not cause a stream to become + * active except as described above. The callback stream will remain idle + * if no device work follows the callback, and will remain idle across + * consecutive callbacks without device work in between. Thus, for example, + * stream synchronization can be done by signaling from a callback at the + * end of the stream.
  • + *
+ * + * \param hStream - Stream to add callback to + * \param callback - The function to call once preceding stream operations are complete + * \param userData - User specified data to be passed to the callback function + * \param flags - Reserved for future use, must be 0 + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \note_null_stream + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamWaitEvent, + * ::cuStreamDestroy, + * ::cuMemAllocManaged, + * ::cuStreamAttachMemAsync, + * ::cuLaunchHostFunc, + * ::cudaStreamAddCallback + */ +CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags); + +/** + * \brief Begins graph capture on a stream + * + * Begin graph capture on \p hStream. When a stream is in capture mode, all operations + * pushed into the stream will not be executed, but will instead be captured into + * a graph, which will be returned via ::cuStreamEndCapture. Capture may not be initiated + * if \p stream is CU_STREAM_LEGACY. Capture must be ended on the same stream in which + * it was initiated, and it may only be initiated if the stream is not already in capture + * mode. The capture mode may be queried via ::cuStreamIsCapturing. A unique id + * representing the capture sequence may be queried via ::cuStreamGetCaptureInfo. + * + * If \p mode is not ::CU_STREAM_CAPTURE_MODE_RELAXED, ::cuStreamEndCapture must be + * called on this stream from the same thread. + * + * \param hStream - Stream in which to initiate capture + * \param mode - Controls the interaction of this capture sequence with other API + * calls that are potentially unsafe. For more details see + * ::cuThreadExchangeStreamCaptureMode. + * + * \note Kernels captured using this API must not use texture and surface references. + * Reading or writing through any texture or surface reference is undefined + * behavior. This restriction does not apply to texture and surface objects. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa + * ::cuStreamCreate, + * ::cuStreamIsCapturing, + * ::cuStreamEndCapture, + * ::cuThreadExchangeStreamCaptureMode + */ +CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream, CUstreamCaptureMode mode); + +/** + * \brief Swaps the stream capture interaction mode for a thread + * + * Sets the calling thread's stream capture interaction mode to the value contained + * in \p *mode, and overwrites \p *mode with the previous mode for the thread. To + * facilitate deterministic behavior across function or module boundaries, callers + * are encouraged to use this API in a push-pop fashion: \code + CUstreamCaptureMode mode = desiredMode; + cuThreadExchangeStreamCaptureMode(&mode); + ... + cuThreadExchangeStreamCaptureMode(&mode); // restore previous mode + * \endcode + * + * During stream capture (see ::cuStreamBeginCapture), some actions, such as a call + * to ::cudaMalloc, may be unsafe. In the case of ::cudaMalloc, the operation is + * not enqueued asynchronously to a stream, and is not observed by stream capture. + * Therefore, if the sequence of operations captured via ::cuStreamBeginCapture + * depended on the allocation being replayed whenever the graph is launched, the + * captured graph would be invalid. + * + * Therefore, stream capture places restrictions on API calls that can be made within + * or concurrently to a ::cuStreamBeginCapture-::cuStreamEndCapture sequence. This + * behavior can be controlled via this API and flags to ::cuStreamBeginCapture. + * + * A thread's mode is one of the following: + * - \p CU_STREAM_CAPTURE_MODE_GLOBAL: This is the default mode. If the local thread has + * an ongoing capture sequence that was not initiated with + * \p CU_STREAM_CAPTURE_MODE_RELAXED at \p cuStreamBeginCapture, or if any other thread + * has a concurrent capture sequence initiated with \p CU_STREAM_CAPTURE_MODE_GLOBAL, + * this thread is prohibited from potentially unsafe API calls. + * - \p CU_STREAM_CAPTURE_MODE_THREAD_LOCAL: If the local thread has an ongoing capture + * sequence not initiated with \p CU_STREAM_CAPTURE_MODE_RELAXED, it is prohibited + * from potentially unsafe API calls. Concurrent capture sequences in other threads + * are ignored. + * - \p CU_STREAM_CAPTURE_MODE_RELAXED: The local thread is not prohibited from potentially + * unsafe API calls. Note that the thread is still prohibited from API calls which + * necessarily conflict with stream capture, for example, attempting ::cuEventQuery + * on an event that was last recorded inside a capture sequence. + * + * \param mode - Pointer to mode value to swap with the current mode + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa + * ::cuStreamBeginCapture + */ +CUresult CUDAAPI cuThreadExchangeStreamCaptureMode(CUstreamCaptureMode *mode); + +/** + * \brief Ends capture on a stream, returning the captured graph + * + * End capture on \p hStream, returning the captured graph via \p phGraph. + * Capture must have been initiated on \p hStream via a call to ::cuStreamBeginCapture. + * If capture was invalidated, due to a violation of the rules of stream capture, then + * a NULL graph will be returned. + * + * If the \p mode argument to ::cuStreamBeginCapture was not + * ::CU_STREAM_CAPTURE_MODE_RELAXED, this call must be from the same thread as + * ::cuStreamBeginCapture. + * + * \param hStream - Stream to query + * \param phGraph - The captured graph + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD + * \notefnerr + * + * \sa + * ::cuStreamCreate, + * ::cuStreamBeginCapture, + * ::cuStreamIsCapturing + */ +CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph); + +/** + * \brief Returns a stream's capture status + * + * Return the capture status of \p hStream via \p captureStatus. After a successful + * call, \p *captureStatus will contain one of the following: + * - ::CU_STREAM_CAPTURE_STATUS_NONE: The stream is not capturing. + * - ::CU_STREAM_CAPTURE_STATUS_ACTIVE: The stream is capturing. + * - ::CU_STREAM_CAPTURE_STATUS_INVALIDATED: The stream was capturing but an error + * has invalidated the capture sequence. The capture sequence must be terminated + * with ::cuStreamEndCapture on the stream where it was initiated in order to + * continue using \p hStream. + * + * Note that, if this is called on ::CU_STREAM_LEGACY (the "null stream") while + * a blocking stream in the same context is capturing, it will return + * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT and \p *captureStatus is unspecified + * after the call. The blocking stream capture is not invalidated. + * + * When a blocking stream is capturing, the legacy stream is in an + * unusable state until the blocking stream capture is terminated. The legacy + * stream is not supported for stream capture, but attempted use would have an + * implicit dependency on the capturing stream(s). + * + * \param hStream - Stream to query + * \param captureStatus - Returns the stream's capture status + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT + * \notefnerr + * + * \sa + * ::cuStreamCreate, + * ::cuStreamBeginCapture, + * ::cuStreamEndCapture + */ +CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *captureStatus); + +/** + * \brief Query a stream's capture state + * + * Query stream state related to stream capture. + * + * If called on ::CU_STREAM_LEGACY (the "null stream") while a stream not created + * with ::CU_STREAM_NON_BLOCKING is capturing, returns ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT. + * + * Valid data (other than capture status) is returned only if both of the following are true: + * - the call returns CUDA_SUCCESS + * - the returned capture status is ::CU_STREAM_CAPTURE_STATUS_ACTIVE + * + * \param hStream - The stream to query + * \param captureStatus_out - Location to return the capture status of the stream; required + * \param id_out - Optional location to return an id for the capture sequence, which is + * unique over the lifetime of the process + * \param graph_out - Optional location to return the graph being captured into. All + * operations other than destroy and node removal are permitted on the graph + * while the capture sequence is in progress. This API does not transfer + * ownership of the graph, which is transferred or destroyed at + * ::cuStreamEndCapture. Note that the graph handle may be invalidated before + * end of capture for certain errors. Nodes that are or become + * unreachable from the original stream at ::cuStreamEndCapture due to direct + * actions on the graph do not trigger ::CUDA_ERROR_STREAM_CAPTURE_UNJOINED. + * \param dependencies_out - Optional location to store a pointer to an array of nodes. + * The next node to be captured in the stream will depend on this set of nodes, + * absent operations such as event wait which modify this set. The array pointer + * is valid until the next API call which operates on the stream or until end of + * capture. The node handles may be copied out and are valid until they or the + * graph is destroyed. The driver-owned array may also be passed directly to + * APIs that operate on the graph (not the stream) without copying. + * \param numDependencies_out - Optional location to store the size of the array + * returned in dependencies_out. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuStreamBeginCapture, + * ::cuStreamIsCapturing, + * ::cuStreamUpdateCaptureDependencies + */ +CUresult CUDAAPI cuStreamGetCaptureInfo(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, + cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out); + +/** + * \brief Update the set of dependencies in a capturing stream (11.3+) + * + * Modifies the dependency set of a capturing stream. The dependency set is the set + * of nodes that the next captured node in the stream will depend on. + * + * Valid flags are ::CU_STREAM_ADD_CAPTURE_DEPENDENCIES and + * ::CU_STREAM_SET_CAPTURE_DEPENDENCIES. These control whether the set passed to + * the API is added to the existing set or replaces it. A flags value of 0 defaults + * to ::CU_STREAM_ADD_CAPTURE_DEPENDENCIES. + * + * Nodes that are removed from the dependency set via this API do not result in + * ::CUDA_ERROR_STREAM_CAPTURE_UNJOINED if they are unreachable from the stream at + * ::cuStreamEndCapture. + * + * Returns ::CUDA_ERROR_ILLEGAL_STATE if the stream is not capturing. + * + * This API is new in CUDA 11.3. Developers requiring compatibility across minor + * versions to CUDA 11.0 should not use this API or provide a fallback. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_ILLEGAL_STATE + * + * \sa + * ::cuStreamBeginCapture, + * ::cuStreamGetCaptureInfo, + */ +CUresult CUDAAPI cuStreamUpdateCaptureDependencies(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags); + +/** + * \brief Attach memory to a stream asynchronously + * + * Enqueues an operation in \p hStream to specify stream association of + * \p length bytes of memory starting from \p dptr. This function is a + * stream-ordered operation, meaning that it is dependent on, and will + * only take effect when, previous work in stream has completed. Any + * previous association is automatically replaced. + * + * \p dptr must point to one of the following types of memories: + * - managed memory declared using the __managed__ keyword or allocated with + * ::cuMemAllocManaged. + * - a valid host-accessible region of system-allocated pageable memory. This + * type of memory may only be specified if the device associated with the + * stream reports a non-zero value for the device attribute + * ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + * + * For managed allocations, \p length must be either zero or the entire + * allocation's size. Both indicate that the entire allocation's stream + * association is being changed. Currently, it is not possible to change stream + * association for a portion of a managed allocation. + * + * For pageable host allocations, \p length must be non-zero. + * + * The stream association is specified using \p flags which must be + * one of ::CUmemAttach_flags. + * If the ::CU_MEM_ATTACH_GLOBAL flag is specified, the memory can be accessed + * by any stream on any device. + * If the ::CU_MEM_ATTACH_HOST flag is specified, the program makes a guarantee + * that it won't access the memory on the device from any stream on a device that + * has a zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + * If the ::CU_MEM_ATTACH_SINGLE flag is specified and \p hStream is associated with + * a device that has a zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, + * the program makes a guarantee that it will only access the memory on the device + * from \p hStream. It is illegal to attach singly to the NULL stream, because the + * NULL stream is a virtual global stream and not a specific stream. An error will + * be returned in this case. + * + * When memory is associated with a single stream, the Unified Memory system will + * allow CPU access to this memory region so long as all operations in \p hStream + * have completed, regardless of whether other streams are active. In effect, + * this constrains exclusive ownership of the managed memory region by + * an active GPU to per-stream activity instead of whole-GPU activity. + * + * Accessing memory on the device from streams that are not associated with + * it will produce undefined results. No error checking is performed by the + * Unified Memory system to ensure that kernels launched into other streams + * do not access this region. + * + * It is a program's responsibility to order calls to ::cuStreamAttachMemAsync + * via events, synchronization or other means to ensure legal access to memory + * at all times. Data visibility and coherency will be changed appropriately + * for all kernels which follow a stream-association change. + * + * If \p hStream is destroyed while data is associated with it, the association is + * removed and the association reverts to the default visibility of the allocation + * as specified at ::cuMemAllocManaged. For __managed__ variables, the default + * association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a stream is an + * asynchronous operation, and as a result, the change to default association won't + * happen until all work in the stream has completed. + * + * \param hStream - Stream in which to enqueue the attach operation + * \param dptr - Pointer to memory (must be a pointer to managed memory or + * to a valid host-accessible region of system-allocated + * pageable memory) + * \param length - Length of memory + * \param flags - Must be one of ::CUmemAttach_flags + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \note_null_stream + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamWaitEvent, + * ::cuStreamDestroy, + * ::cuMemAllocManaged, + * ::cudaStreamAttachMemAsync + */ +CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags); + +/** + * \brief Determine status of a compute stream + * + * Returns ::CUDA_SUCCESS if all operations in the stream specified by + * \p hStream have completed, or ::CUDA_ERROR_NOT_READY if not. + * + * For the purposes of Unified Memory, a return value of ::CUDA_SUCCESS + * is equivalent to having called ::cuStreamSynchronize(). + * + * \param hStream - Stream to query status of + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_READY + * \note_null_stream + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuStreamWaitEvent, + * ::cuStreamDestroy, + * ::cuStreamSynchronize, + * ::cuStreamAddCallback, + * ::cudaStreamQuery + */ +CUresult CUDAAPI cuStreamQuery(CUstream hStream); + +/** + * \brief Wait until a stream's tasks are completed + * + * Waits until the device has completed all operations in the stream specified + * by \p hStream. If the context was created with the + * ::CU_CTX_SCHED_BLOCKING_SYNC flag, the CPU thread will block until the + * stream is finished with all of its tasks. + * + * \param hStream - Stream to wait for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE + + * \note_null_stream + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuStreamDestroy, + * ::cuStreamWaitEvent, + * ::cuStreamQuery, + * ::cuStreamAddCallback, + * ::cudaStreamSynchronize + */ +CUresult CUDAAPI cuStreamSynchronize(CUstream hStream); + +/** + * \brief Destroys a stream + * + * Destroys the stream specified by \p hStream. + * + * In case the device is still doing work in the stream \p hStream + * when ::cuStreamDestroy() is called, the function will return immediately + * and the resources associated with \p hStream will be released automatically + * once the device has completed all work in \p hStream. + * + * \param hStream - Stream to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuStreamWaitEvent, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamAddCallback, + * ::cudaStreamDestroy + */ +CUresult CUDAAPI cuStreamDestroy(CUstream hStream); + +/** + * \brief Copies attributes from source stream to destination stream. + * + * Copies attributes from source stream \p src to destination stream \p dst. + * Both streams must have the same context. + * + * \param[out] dst Destination stream + * \param[in] src Source stream + * For list of attributes see ::CUstreamAttrID + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa + * ::CUaccessPolicyWindow + */ +CUresult CUDAAPI cuStreamCopyAttributes(CUstream dst, CUstream src); + +/** + * \brief Queries stream attribute. + * + * Queries attribute \p attr from \p hStream and stores it in corresponding + * member of \p value_out. + * + * \param[in] hStream + * \param[in] attr + * \param[out] value_out + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa + * ::CUaccessPolicyWindow + */ +CUresult CUDAAPI cuStreamGetAttribute(CUstream hStream, CUstreamAttrID attr, + CUstreamAttrValue *value_out); + +/** + * \brief Sets stream attribute. + * + * Sets attribute \p attr on \p hStream from corresponding attribute of + * \p value. The updated attribute will be applied to subsequent work + * submitted to the stream. It will not affect previously submitted work. + * + * \param[out] hStream + * \param[in] attr + * \param[in] value + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa + * ::CUaccessPolicyWindow + */ +CUresult CUDAAPI cuStreamSetAttribute(CUstream hStream, CUstreamAttrID attr, + const CUstreamAttrValue *value); + +/** @} */ /* END CUDA_STREAM */ + + +/** + * \defgroup CUDA_EVENT Event Management + * + * ___MANBRIEF___ event management functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the event management functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Creates an event + * + * Creates an event *phEvent for the current context with the flags specified via + * \p Flags. Valid flags include: + * - ::CU_EVENT_DEFAULT: Default event creation flag. + * - ::CU_EVENT_BLOCKING_SYNC: Specifies that the created event should use blocking + * synchronization. A CPU thread that uses ::cuEventSynchronize() to wait on + * an event created with this flag will block until the event has actually + * been recorded. + * - ::CU_EVENT_DISABLE_TIMING: Specifies that the created event does not need + * to record timing data. Events created with this flag specified and + * the ::CU_EVENT_BLOCKING_SYNC flag not specified will provide the best + * performance when used with ::cuStreamWaitEvent() and ::cuEventQuery(). + * - ::CU_EVENT_INTERPROCESS: Specifies that the created event may be used as an + * interprocess event by ::cuIpcGetEventHandle(). ::CU_EVENT_INTERPROCESS must + * be specified along with ::CU_EVENT_DISABLE_TIMING. + * + * \param phEvent - Returns newly created event + * \param Flags - Event creation flags + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa + * ::cuEventRecord, + * ::cuEventQuery, + * ::cuEventSynchronize, + * ::cuEventDestroy, + * ::cuEventElapsedTime, + * ::cudaEventCreate, + * ::cudaEventCreateWithFlags + */ +CUresult CUDAAPI cuEventCreate(CUevent *phEvent, unsigned int Flags); + +/** + * \brief Records an event + * + * Captures in \p hEvent the contents of \p hStream at the time of this call. + * \p hEvent and \p hStream must be from the same context. + * Calls such as ::cuEventQuery() or ::cuStreamWaitEvent() will then + * examine or wait for completion of the work that was captured. Uses of + * \p hStream after this call do not modify \p hEvent. See note on default + * stream behavior for what is captured in the default case. + * + * ::cuEventRecord() can be called multiple times on the same event and + * will overwrite the previously captured state. Other APIs such as + * ::cuStreamWaitEvent() use the most recently captured state at the time + * of the API call, and are not affected by later calls to + * ::cuEventRecord(). Before the first call to ::cuEventRecord(), an + * event represents an empty set of work, so for example ::cuEventQuery() + * would return ::CUDA_SUCCESS. + * + * \param hEvent - Event to record + * \param hStream - Stream to record event for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE + * \note_null_stream + * \notefnerr + * + * \sa ::cuEventCreate, + * ::cuEventQuery, + * ::cuEventSynchronize, + * ::cuStreamWaitEvent, + * ::cuEventDestroy, + * ::cuEventElapsedTime, + * ::cudaEventRecord, + * ::cuEventRecordWithFlags + */ +CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream); + +/** + * \brief Records an event + * + * Captures in \p hEvent the contents of \p hStream at the time of this call. + * \p hEvent and \p hStream must be from the same context. + * Calls such as ::cuEventQuery() or ::cuStreamWaitEvent() will then + * examine or wait for completion of the work that was captured. Uses of + * \p hStream after this call do not modify \p hEvent. See note on default + * stream behavior for what is captured in the default case. + * + * ::cuEventRecordWithFlags() can be called multiple times on the same event and + * will overwrite the previously captured state. Other APIs such as + * ::cuStreamWaitEvent() use the most recently captured state at the time + * of the API call, and are not affected by later calls to + * ::cuEventRecordWithFlags(). Before the first call to ::cuEventRecordWithFlags(), an + * event represents an empty set of work, so for example ::cuEventQuery() + * would return ::CUDA_SUCCESS. + * + * flags include: + * - ::CU_EVENT_RECORD_DEFAULT: Default event creation flag. + * - ::CU_EVENT_RECORD_EXTERNAL: Event is captured in the graph as an external + * event node when performing stream capture. This flag is invalid outside + * of stream capture. + * + * \param hEvent - Event to record + * \param hStream - Stream to record event for + * \param flags - See ::CUevent_capture_flags + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE + * \note_null_stream + * \notefnerr + * + * \sa ::cuEventCreate, + * ::cuEventQuery, + * ::cuEventSynchronize, + * ::cuStreamWaitEvent, + * ::cuEventDestroy, + * ::cuEventElapsedTime, + * ::cuEventRecord, + * ::cudaEventRecord + */ +CUresult CUDAAPI cuEventRecordWithFlags(CUevent hEvent, CUstream hStream, unsigned int flags); + +/** + * \brief Queries an event's status + * + * Queries the status of all work currently captured by \p hEvent. See + * ::cuEventRecord() for details on what is captured by an event. + * + * Returns ::CUDA_SUCCESS if all captured work has been completed, or + * ::CUDA_ERROR_NOT_READY if any captured work is incomplete. + * + * For the purposes of Unified Memory, a return value of ::CUDA_SUCCESS + * is equivalent to having called ::cuEventSynchronize(). + * + * \param hEvent - Event to query + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_READY + * \notefnerr + * + * \sa ::cuEventCreate, + * ::cuEventRecord, + * ::cuEventSynchronize, + * ::cuEventDestroy, + * ::cuEventElapsedTime, + * ::cudaEventQuery + */ +CUresult CUDAAPI cuEventQuery(CUevent hEvent); + +/** + * \brief Waits for an event to complete + * + * Waits until the completion of all work currently captured in \p hEvent. + * See ::cuEventRecord() for details on what is captured by an event. + * + * Waiting for an event that was created with the ::CU_EVENT_BLOCKING_SYNC + * flag will cause the calling CPU thread to block until the event has + * been completed by the device. If the ::CU_EVENT_BLOCKING_SYNC flag has + * not been set, then the CPU thread will busy-wait until the event has + * been completed by the device. + * + * \param hEvent - Event to wait for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuEventCreate, + * ::cuEventRecord, + * ::cuEventQuery, + * ::cuEventDestroy, + * ::cuEventElapsedTime, + * ::cudaEventSynchronize + */ +CUresult CUDAAPI cuEventSynchronize(CUevent hEvent); + +/** + * \brief Destroys an event + * + * Destroys the event specified by \p hEvent. + * + * An event may be destroyed before it is complete (i.e., while + * ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY). In this case, the + * call does not block on completion of the event, and any associated + * resources will automatically be released asynchronously at completion. + * + * \param hEvent - Event to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuEventCreate, + * ::cuEventRecord, + * ::cuEventQuery, + * ::cuEventSynchronize, + * ::cuEventElapsedTime, + * ::cudaEventDestroy + */ +CUresult CUDAAPI cuEventDestroy(CUevent hEvent); + +/** + * \brief Computes the elapsed time between two events + * + * Computes the elapsed time between two events (in milliseconds with a + * resolution of around 0.5 microseconds). + * + * If either event was last recorded in a non-NULL stream, the resulting time + * may be greater than expected (even if both used the same stream handle). This + * happens because the ::cuEventRecord() operation takes place asynchronously + * and there is no guarantee that the measured latency is actually just between + * the two events. Any number of other different stream operations could execute + * in between the two measured events, thus altering the timing in a significant + * way. + * + * If ::cuEventRecord() has not been called on either event then + * ::CUDA_ERROR_INVALID_HANDLE is returned. If ::cuEventRecord() has been called + * on both events but one or both of them has not yet been completed (that is, + * ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY on at least one of the + * events), ::CUDA_ERROR_NOT_READY is returned. If either event was created with + * the ::CU_EVENT_DISABLE_TIMING flag, then this function will return + * ::CUDA_ERROR_INVALID_HANDLE. + * + * \param pMilliseconds - Time between \p hStart and \p hEnd in ms + * \param hStart - Starting event + * \param hEnd - Ending event + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_READY, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa ::cuEventCreate, + * ::cuEventRecord, + * ::cuEventQuery, + * ::cuEventSynchronize, + * ::cuEventDestroy, + * ::cudaEventElapsedTime + */ +CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd); + +/** @} */ /* END CUDA_EVENT */ + +/** + * \defgroup CUDA_EXTRES_INTEROP External Resource Interoperability + * + * ___MANBRIEF___ External resource interoperability functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the external resource interoperability functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + + /** + * \brief Imports an external memory object + * + * Imports an externally allocated memory object and returns + * a handle to that in \p extMem_out. + * + * The properties of the handle being imported must be described in + * \p memHandleDesc. The ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC structure + * is defined as follows: + * + * \code + typedef struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st { + CUexternalMemoryHandleType type; + union { + int fd; + struct { + void *handle; + const void *name; + } win32; + const void *nvSciBufObject; + } handle; + unsigned long long size; + unsigned int flags; + } CUDA_EXTERNAL_MEMORY_HANDLE_DESC; + * \endcode + * + * where ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type specifies the type + * of handle being imported. ::CUexternalMemoryHandleType is + * defined as: + * + * \code + typedef enum CUexternalMemoryHandleType_enum { + CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD = 1, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32 = 2, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP = 4, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE = 5, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE = 6, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT = 7, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF = 8 + } CUexternalMemoryHandleType; + * \endcode + * + * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD, then + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::fd must be a valid + * file descriptor referencing a memory object. Ownership of + * the file descriptor is transferred to the CUDA driver when the + * handle is imported successfully. Performing any operations on the + * file descriptor after it is imported results in undefined behavior. + * + * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32, then exactly one + * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be + * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle + * is not NULL, then it must represent a valid shared NT handle that + * references a memory object. Ownership of this handle is + * not transferred to CUDA after the import operation, so the + * application must release the handle using the appropriate system + * call. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name + * is not NULL, then it must point to a NULL-terminated array of + * UTF-16 characters that refers to a memory object. + * + * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT, then + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must + * be non-NULL and + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name + * must be NULL. The handle specified must be a globally shared KMT + * handle. This handle does not hold a reference to the underlying + * object, and thus will be invalid when all references to the + * memory object are destroyed. + * + * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP, then exactly one + * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be + * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle + * is not NULL, then it must represent a valid shared NT handle that + * is returned by ID3D12Device::CreateSharedHandle when referring to a + * ID3D12Heap object. This handle holds a reference to the underlying + * object. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name + * is not NULL, then it must point to a NULL-terminated array of + * UTF-16 characters that refers to a ID3D12Heap object. + * + * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE, then exactly one + * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be + * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle + * is not NULL, then it must represent a valid shared NT handle that + * is returned by ID3D12Device::CreateSharedHandle when referring to a + * ID3D12Resource object. This handle holds a reference to the + * underlying object. If + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name + * is not NULL, then it must point to a NULL-terminated array of + * UTF-16 characters that refers to a ID3D12Resource object. + * + * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE, then + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must + * represent a valid shared NT handle that is returned by + * IDXGIResource1::CreateSharedHandle when referring to a + * ID3D11Resource object. If + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name + * is not NULL, then it must point to a NULL-terminated array of + * UTF-16 characters that refers to a ID3D11Resource object. + * + * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT, then + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must + * represent a valid shared KMT handle that is returned by + * IDXGIResource::GetSharedHandle when referring to a + * ID3D11Resource object and + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name + * must be NULL. + * + * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, then + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::nvSciBufObject must be non-NULL + * and reference a valid NvSciBuf object. + * If the NvSciBuf object imported into CUDA is also mapped by other drivers, then the + * application must use ::cuWaitExternalSemaphoresAsync or ::cuSignalExternalSemaphoresAsync + * as appropriate barriers to maintain coherence between CUDA and the other drivers. + * See ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC and ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC + * for memory synchronization. + * + * + * The size of the memory object must be specified in + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::size. + * + * Specifying the flag ::CUDA_EXTERNAL_MEMORY_DEDICATED in + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::flags indicates that the + * resource is a dedicated resource. The definition of what a + * dedicated resource is outside the scope of this extension. + * This flag must be set if ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type + * is one of the following: + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT + * + * \param extMem_out - Returned handle to an external memory object + * \param memHandleDesc - Memory import handle descriptor + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OPERATING_SYSTEM + * \notefnerr + * + * \note If the Vulkan memory imported into CUDA is mapped on the CPU then the + * application must use vkInvalidateMappedMemoryRanges/vkFlushMappedMemoryRanges + * as well as appropriate Vulkan pipeline barriers to maintain coherence between + * CPU and GPU. For more information on these APIs, please refer to "Synchronization + * and Cache Control" chapter from Vulkan specification. + * + * \sa ::cuDestroyExternalMemory, + * ::cuExternalMemoryGetMappedBuffer, + * ::cuExternalMemoryGetMappedMipmappedArray + */ +CUresult CUDAAPI cuImportExternalMemory(CUexternalMemory *extMem_out, const CUDA_EXTERNAL_MEMORY_HANDLE_DESC *memHandleDesc); + +/** + * \brief Maps a buffer onto an imported memory object + * + * Maps a buffer onto an imported memory object and returns a device + * pointer in \p devPtr. + * + * The properties of the buffer being mapped must be described in + * \p bufferDesc. The ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC structure is + * defined as follows: + * + * \code + typedef struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st { + unsigned long long offset; + unsigned long long size; + unsigned int flags; + } CUDA_EXTERNAL_MEMORY_BUFFER_DESC; + * \endcode + * + * where ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::offset is the offset in + * the memory object where the buffer's base address is. + * ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::size is the size of the buffer. + * ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::flags must be zero. + * + * The offset and size have to be suitably aligned to match the + * requirements of the external API. Mapping two buffers whose ranges + * overlap may or may not result in the same virtual address being + * returned for the overlapped portion. In such cases, the application + * must ensure that all accesses to that region from the GPU are + * volatile. Otherwise writes made via one address are not guaranteed + * to be visible via the other address, even if they're issued by the + * same thread. It is recommended that applications map the combined + * range instead of mapping separate buffers and then apply the + * appropriate offsets to the returned pointer to derive the + * individual buffers. + * + * The returned pointer \p devPtr must be freed using ::cuMemFree. + * + * \param devPtr - Returned device pointer to buffer + * \param extMem - Handle to external memory object + * \param bufferDesc - Buffer descriptor + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuImportExternalMemory, + * ::cuDestroyExternalMemory, + * ::cuExternalMemoryGetMappedMipmappedArray + */ +CUresult CUDAAPI cuExternalMemoryGetMappedBuffer(CUdeviceptr *devPtr, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_BUFFER_DESC *bufferDesc); + +/** + * \brief Maps a CUDA mipmapped array onto an external memory object + * + * Maps a CUDA mipmapped array onto an external object and returns a + * handle to it in \p mipmap. + * + * The properties of the CUDA mipmapped array being mapped must be + * described in \p mipmapDesc. The structure + * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC is defined as follows: + * + * \code + typedef struct CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st { + unsigned long long offset; + CUDA_ARRAY3D_DESCRIPTOR arrayDesc; + unsigned int numLevels; + } CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC; + * \endcode + * + * where ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::offset is the + * offset in the memory object where the base level of the mipmap + * chain is. + * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::arrayDesc describes + * the format, dimensions and type of the base level of the mipmap + * chain. For further details on these parameters, please refer to the + * documentation for ::cuMipmappedArrayCreate. Note that if the mipmapped + * array is bound as a color target in the graphics API, then the flag + * ::CUDA_ARRAY3D_COLOR_ATTACHMENT must be specified in + * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::arrayDesc::Flags. + * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::numLevels specifies + * the total number of levels in the mipmap chain. + * + * If \p extMem was imported from a handle of type ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, then + * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::numLevels must be equal to 1. + * + * The returned CUDA mipmapped array must be freed using ::cuMipmappedArrayDestroy. + * + * \param mipmap - Returned CUDA mipmapped array + * \param extMem - Handle to external memory object + * \param mipmapDesc - CUDA array descriptor + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuImportExternalMemory, + * ::cuDestroyExternalMemory, + * ::cuExternalMemoryGetMappedBuffer + */ +CUresult CUDAAPI cuExternalMemoryGetMappedMipmappedArray(CUmipmappedArray *mipmap, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *mipmapDesc); + +/** + * \brief Destroys an external memory object. + * + * Destroys the specified external memory object. Any existing buffers + * and CUDA mipmapped arrays mapped onto this object must no longer be + * used and must be explicitly freed using ::cuMemFree and + * ::cuMipmappedArrayDestroy respectively. + * + * \param extMem - External memory object to be destroyed + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuImportExternalMemory, + * ::cuExternalMemoryGetMappedBuffer, + * ::cuExternalMemoryGetMappedMipmappedArray + */ +CUresult CUDAAPI cuDestroyExternalMemory(CUexternalMemory extMem); + +/** + * \brief Imports an external semaphore + * + * Imports an externally allocated synchronization object and returns + * a handle to that in \p extSem_out. + * + * The properties of the handle being imported must be described in + * \p semHandleDesc. The ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC is + * defined as follows: + * + * \code + typedef struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st { + CUexternalSemaphoreHandleType type; + union { + int fd; + struct { + void *handle; + const void *name; + } win32; + const void* NvSciSyncObj; + } handle; + unsigned int flags; + } CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC; + * \endcode + * + * where ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type specifies the type of + * handle being imported. ::CUexternalSemaphoreHandleType is defined + * as: + * + * \code + typedef enum CUexternalSemaphoreHandleType_enum { + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD = 1, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32 = 2, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE = 4, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE = 5, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC = 6, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX = 7, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT = 8, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD = 9, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 = 10 + } CUexternalSemaphoreHandleType; + * \endcode + * + * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD, then + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::fd must be a valid + * file descriptor referencing a synchronization object. Ownership of + * the file descriptor is transferred to the CUDA driver when the + * handle is imported successfully. Performing any operations on the + * file descriptor after it is imported results in undefined behavior. + * + * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32, then exactly one + * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be + * NULL. If + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle + * is not NULL, then it must represent a valid shared NT handle that + * references a synchronization object. Ownership of this handle is + * not transferred to CUDA after the import operation, so the + * application must release the handle using the appropriate system + * call. If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name + * is not NULL, then it must name a valid synchronization object. + * + * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT, then + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle must + * be non-NULL and + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name + * must be NULL. The handle specified must be a globally shared KMT + * handle. This handle does not hold a reference to the underlying + * object, and thus will be invalid when all references to the + * synchronization object are destroyed. + * + * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, then exactly one + * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be + * NULL. If + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle + * is not NULL, then it must represent a valid shared NT handle that + * is returned by ID3D12Device::CreateSharedHandle when referring to a + * ID3D12Fence object. This handle holds a reference to the underlying + * object. If + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name + * is not NULL, then it must name a valid synchronization object that + * refers to a valid ID3D12Fence object. + * + * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE, then + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle + * represents a valid shared NT handle that is returned by + * ID3D11Fence::CreateSharedHandle. If + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name + * is not NULL, then it must name a valid synchronization object that + * refers to a valid ID3D11Fence object. + * + * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, then + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::nvSciSyncObj + * represents a valid NvSciSyncObj. + * + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX, then + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle + * represents a valid shared NT handle that + * is returned by IDXGIResource1::CreateSharedHandle when referring to + * a IDXGIKeyedMutex object. If + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name + * is not NULL, then it must name a valid synchronization object that + * refers to a valid IDXGIKeyedMutex object. + * + * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT, then + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle + * represents a valid shared KMT handle that + * is returned by IDXGIResource::GetSharedHandle when referring to + * a IDXGIKeyedMutex object and + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must be NULL. + * + * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD, then + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::fd must be a valid + * file descriptor referencing a synchronization object. Ownership of + * the file descriptor is transferred to the CUDA driver when the + * handle is imported successfully. Performing any operations on the + * file descriptor after it is imported results in undefined behavior. + * + * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32, then exactly one + * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be + * NULL. If + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle + * is not NULL, then it must represent a valid shared NT handle that + * references a synchronization object. Ownership of this handle is + * not transferred to CUDA after the import operation, so the + * application must release the handle using the appropriate system + * call. If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name + * is not NULL, then it must name a valid synchronization object. + * + * \param extSem_out - Returned handle to an external semaphore + * \param semHandleDesc - Semaphore import handle descriptor + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OPERATING_SYSTEM + * \notefnerr + * + * \sa ::cuDestroyExternalSemaphore, + * ::cuSignalExternalSemaphoresAsync, + * ::cuWaitExternalSemaphoresAsync + */ +CUresult CUDAAPI cuImportExternalSemaphore(CUexternalSemaphore *extSem_out, const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC *semHandleDesc); + +/** + * \brief Signals a set of external semaphore objects + * + * Enqueues a signal operation on a set of externally allocated + * semaphore object in the specified stream. The operations will be + * executed when all prior operations in the stream complete. + * + * The exact semantics of signaling a semaphore depends on the type of + * the object. + * + * If the semaphore object is any one of the following types: + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD, + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32, + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT + * then signaling the semaphore will set it to the signaled state. + * + * If the semaphore object is any one of the following types: + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE, + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD, + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 + * then the semaphore will be set to the value specified in + * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::fence::value. + * + * If the semaphore object is of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC + * this API sets ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence + * to a value that can be used by subsequent waiters of the same NvSciSync object + * to order operations with those currently submitted in \p stream. Such an update + * will overwrite previous contents of + * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence. By default, + * signaling such an external semaphore object causes appropriate memory synchronization + * operations to be performed over all external memory objects that are imported as + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. This ensures that any subsequent accesses + * made by other importers of the same set of NvSciBuf memory object(s) are coherent. + * These operations can be skipped by specifying the flag + * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC, which can be used as a + * performance optimization when data coherency is not required. But specifying this + * flag in scenarios where data coherency is required results in undefined behavior. + * Also, for semaphore object of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, + * if the NvSciSyncAttrList used to create the NvSciSyncObj had not set the flags in + * ::cuDeviceGetNvSciSyncAttributes to CUDA_NVSCISYNC_ATTR_SIGNAL, this API will return + * CUDA_ERROR_NOT_SUPPORTED. + * NvSciSyncFence associated with semaphore object of the type + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC can be deterministic. For this the + * NvSciSyncAttrList used to create the semaphore object must have value of + * NvSciSyncAttrKey_RequireDeterministicFences key set to true. Deterministic fences + * allow users to enqueue a wait over the semaphore object even before corresponding + * signal is enqueued. For such a semaphore object, CUDA guarantees that each signal + * operation will increment the fence value by '1'. Users are expected to track count + * of signals enqueued on the semaphore object and insert waits accordingly. When such + * a semaphore object is signaled from multiple streams, due to concurrent stream + * execution, it is possible that the order in which the semaphore gets signaled is + * indeterministic. This could lead to waiters of the semaphore getting unblocked + * incorrectly. Users are expected to handle such situations, either by not using the + * same semaphore object with deterministic fence support enabled in different streams + * or by adding explicit dependency amongst such streams so that the semaphore is + * signaled in order. + * + * If the semaphore object is any one of the following types: + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX, + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT + * then the keyed mutex will be released with the key specified in + * ::CUDA_EXTERNAL_SEMAPHORE_PARAMS::params::keyedmutex::key. + * + * \param extSemArray - Set of external semaphores to be signaled + * \param paramsArray - Array of semaphore parameters + * \param numExtSems - Number of semaphores to signal + * \param stream - Stream to enqueue the signal operations in + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \notefnerr + * + * \sa ::cuImportExternalSemaphore, + * ::cuDestroyExternalSemaphore, + * ::cuWaitExternalSemaphoresAsync + */ +CUresult CUDAAPI cuSignalExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); + +/** + * \brief Waits on a set of external semaphore objects + * + * Enqueues a wait operation on a set of externally allocated + * semaphore object in the specified stream. The operations will be + * executed when all prior operations in the stream complete. + * + * The exact semantics of waiting on a semaphore depends on the type + * of the object. + * + * If the semaphore object is any one of the following types: + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD, + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32, + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT + * then waiting on the semaphore will wait until the semaphore reaches + * the signaled state. The semaphore will then be reset to the + * unsignaled state. Therefore for every signal operation, there can + * only be one wait operation. + * + * If the semaphore object is any one of the following types: + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE, + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD, + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 + * then waiting on the semaphore will wait until the value of the + * semaphore is greater than or equal to + * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::fence::value. + * + * If the semaphore object is of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC + * then, waiting on the semaphore will wait until the + * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence is signaled by the + * signaler of the NvSciSyncObj that was associated with this semaphore object. + * By default, waiting on such an external semaphore object causes appropriate + * memory synchronization operations to be performed over all external memory objects + * that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. This ensures that + * any subsequent accesses made by other importers of the same set of NvSciBuf memory + * object(s) are coherent. These operations can be skipped by specifying the flag + * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC, which can be used as a + * performance optimization when data coherency is not required. But specifying this + * flag in scenarios where data coherency is required results in undefined behavior. + * Also, for semaphore object of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, + * if the NvSciSyncAttrList used to create the NvSciSyncObj had not set the flags in + * ::cuDeviceGetNvSciSyncAttributes to CUDA_NVSCISYNC_ATTR_WAIT, this API will return + * CUDA_ERROR_NOT_SUPPORTED. + * + * If the semaphore object is any one of the following types: + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX, + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT + * then the keyed mutex will be acquired when it is released with the key + * specified in ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::keyedmutex::key + * or until the timeout specified by + * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::keyedmutex::timeoutMs + * has lapsed. The timeout interval can either be a finite value + * specified in milliseconds or an infinite value. In case an infinite + * value is specified the timeout never elapses. The windows INFINITE + * macro must be used to specify infinite timeout. + * + * \param extSemArray - External semaphores to be waited on + * \param paramsArray - Array of semaphore parameters + * \param numExtSems - Number of semaphores to wait on + * \param stream - Stream to enqueue the wait operations in + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_TIMEOUT + * \notefnerr + * + * \sa ::cuImportExternalSemaphore, + * ::cuDestroyExternalSemaphore, + * ::cuSignalExternalSemaphoresAsync + */ +CUresult CUDAAPI cuWaitExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); + +/** + * \brief Destroys an external semaphore + * + * Destroys an external semaphore object and releases any references + * to the underlying resource. Any outstanding signals or waits must + * have completed before the semaphore is destroyed. + * + * \param extSem - External semaphore to be destroyed + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuImportExternalSemaphore, + * ::cuSignalExternalSemaphoresAsync, + * ::cuWaitExternalSemaphoresAsync + */ +CUresult CUDAAPI cuDestroyExternalSemaphore(CUexternalSemaphore extSem); + +/** @} */ /* END CUDA_EXTRES_INTEROP */ + +/** + * \defgroup CUDA_MEMOP Stream Memory Operations + * + * ___MANBRIEF___ Stream memory operations of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the stream memory operations of the low-level CUDA + * driver application programming interface. + * + * Support for the ::CU_STREAM_WAIT_VALUE_NOR flag can be queried with + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR_V2. + * + * Support for the ::cuStreamWriteValue64() and ::cuStreamWaitValue64() + * functions, as well as for the ::CU_STREAM_MEM_OP_WAIT_VALUE_64 and + * ::CU_STREAM_MEM_OP_WRITE_VALUE_64 flags, can be queried with + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS. + * + * Support for both ::CU_STREAM_WAIT_VALUE_FLUSH and + * ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES requires dedicated platform + * hardware features and can be queried with ::cuDeviceGetAttribute() and + * ::CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES. + * + * Note that all memory pointers passed as parameters to these operations + * are device pointers. Where necessary a device pointer should be + * obtained, for example with ::cuMemHostGetDevicePointer(). + * + * None of the operations accepts pointers to managed memory buffers + * (::cuMemAllocManaged). + * + * \note + * Warning: + * Improper use of these APIs may deadlock the application. Synchronization + * ordering established through these APIs is not visible to CUDA. CUDA tasks + * that are (even indirectly) ordered by these APIs should also have that order + * expressed with CUDA-visible dependencies such as events. This ensures that + * the scheduler does not serialize them in an improper order. + * + * @{ + */ + +/** + * \brief Wait on a memory location + * + * Enqueues a synchronization of the stream on the given memory location. Work + * ordered after the operation will block until the given condition on the + * memory is satisfied. By default, the condition is to wait for + * (int32_t)(*addr - value) >= 0, a cyclic greater-or-equal. + * Other condition types can be specified via \p flags. + * + * If the memory was registered via ::cuMemHostRegister(), the device pointer + * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot + * be used with managed memory (::cuMemAllocManaged). + * + * Support for CU_STREAM_WAIT_VALUE_NOR can be queried with ::cuDeviceGetAttribute() and + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR_V2. + * + * \note + * Warning: + * Improper use of this API may deadlock the application. Synchronization + * ordering established through this API is not visible to CUDA. CUDA tasks + * that are (even indirectly) ordered by this API should also have that order + * expressed with CUDA-visible dependencies such as events. This ensures that + * the scheduler does not serialize them in an improper order. + * + * \param stream The stream to synchronize on the memory location. + * \param addr The memory location to wait on. + * \param value The value to compare with the memory location. + * \param flags See ::CUstreamWaitValue_flags. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \notefnerr + * + * \sa ::cuStreamWaitValue64, + * ::cuStreamWriteValue32, + * ::cuStreamWriteValue64, + * ::cuStreamBatchMemOp, + * ::cuMemHostRegister, + * ::cuStreamWaitEvent + */ +CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); + +/** + * \brief Wait on a memory location + * + * Enqueues a synchronization of the stream on the given memory location. Work + * ordered after the operation will block until the given condition on the + * memory is satisfied. By default, the condition is to wait for + * (int64_t)(*addr - value) >= 0, a cyclic greater-or-equal. + * Other condition types can be specified via \p flags. + * + * If the memory was registered via ::cuMemHostRegister(), the device pointer + * should be obtained with ::cuMemHostGetDevicePointer(). + * + * Support for this can be queried with ::cuDeviceGetAttribute() and + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS. + * + * \note + * Warning: + * Improper use of this API may deadlock the application. Synchronization + * ordering established through this API is not visible to CUDA. CUDA tasks + * that are (even indirectly) ordered by this API should also have that order + * expressed with CUDA-visible dependencies such as events. This ensures that + * the scheduler does not serialize them in an improper order. + * + * \param stream The stream to synchronize on the memory location. + * \param addr The memory location to wait on. + * \param value The value to compare with the memory location. + * \param flags See ::CUstreamWaitValue_flags. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \notefnerr + * + * \sa ::cuStreamWaitValue32, + * ::cuStreamWriteValue32, + * ::cuStreamWriteValue64, + * ::cuStreamBatchMemOp, + * ::cuMemHostRegister, + * ::cuStreamWaitEvent + */ +CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); + +/** + * \brief Write a value to memory + * + * Write a value to memory. + * + * If the memory was registered via ::cuMemHostRegister(), the device pointer + * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot + * be used with managed memory (::cuMemAllocManaged). + * + * \param stream The stream to do the write in. + * \param addr The device address to write to. + * \param value The value to write. + * \param flags See ::CUstreamWriteValue_flags. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \notefnerr + * + * \sa ::cuStreamWriteValue64, + * ::cuStreamWaitValue32, + * ::cuStreamWaitValue64, + * ::cuStreamBatchMemOp, + * ::cuMemHostRegister, + * ::cuEventRecord + */ +CUresult CUDAAPI cuStreamWriteValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); + +/** + * \brief Write a value to memory + * + * Write a value to memory. + * + * If the memory was registered via ::cuMemHostRegister(), the device pointer + * should be obtained with ::cuMemHostGetDevicePointer(). + * + * Support for this can be queried with ::cuDeviceGetAttribute() and + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS. + * + * \param stream The stream to do the write in. + * \param addr The device address to write to. + * \param value The value to write. + * \param flags See ::CUstreamWriteValue_flags. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \notefnerr + * + * \sa ::cuStreamWriteValue32, + * ::cuStreamWaitValue32, + * ::cuStreamWaitValue64, + * ::cuStreamBatchMemOp, + * ::cuMemHostRegister, + * ::cuEventRecord + */ +CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); + +/** + * \brief Batch operations to synchronize the stream via memory operations + * + * This is a batch version of ::cuStreamWaitValue32() and ::cuStreamWriteValue32(). + * Batching operations may avoid some performance overhead in both the API call + * and the device execution versus adding them to the stream in separate API + * calls. The operations are enqueued in the order they appear in the array. + * + * See ::CUstreamBatchMemOpType for the full set of supported operations, and + * ::cuStreamWaitValue32(), ::cuStreamWaitValue64(), ::cuStreamWriteValue32(), + * and ::cuStreamWriteValue64() for details of specific operations. + * + * See related APIs for details on querying support for specific operations. + * + * \note + * Warning: + * Improper use of this API may deadlock the application. Synchronization + * ordering established through this API is not visible to CUDA. CUDA tasks + * that are (even indirectly) ordered by this API should also have that order + * expressed with CUDA-visible dependencies such as events. This ensures that + * the scheduler does not serialize them in an improper order. For more + * information, see the Stream Memory Operations section in the programming + * guide(https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html). + * + * \param stream The stream to enqueue the operations in. + * \param count The number of operations in the array. Must be less than 256. + * \param paramArray The types and parameters of the individual operations. + * \param flags Reserved for future expansion; must be 0. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \notefnerr + * + * \sa ::cuStreamWaitValue32, + * ::cuStreamWaitValue64, + * ::cuStreamWriteValue32, + * ::cuStreamWriteValue64, + * ::cuMemHostRegister + */ +CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags); + +/** @} */ /* END CUDA_MEMOP */ + +/** + * \defgroup CUDA_EXEC Execution Control + * + * ___MANBRIEF___ execution control functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the execution control functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Returns information about a function + * + * Returns in \p *pi the integer value of the attribute \p attrib on the kernel + * given by \p hfunc. The supported attributes are: + * - ::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: The maximum number of threads + * per block, beyond which a launch of the function would fail. This number + * depends on both the function and the device on which the function is + * currently loaded. + * - ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: The size in bytes of + * statically-allocated shared memory per block required by this function. + * This does not include dynamically-allocated shared memory requested by + * the user at runtime. + * - ::CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: The size in bytes of user-allocated + * constant memory required by this function. + * - ::CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: The size in bytes of local memory + * used by each thread of this function. + * - ::CU_FUNC_ATTRIBUTE_NUM_REGS: The number of registers used by each thread + * of this function. + * - ::CU_FUNC_ATTRIBUTE_PTX_VERSION: The PTX virtual architecture version for + * which the function was compiled. This value is the major PTX version * 10 + * + the minor PTX version, so a PTX version 1.3 function would return the + * value 13. Note that this may return the undefined value of 0 for cubins + * compiled prior to CUDA 3.0. + * - ::CU_FUNC_ATTRIBUTE_BINARY_VERSION: The binary architecture version for + * which the function was compiled. This value is the major binary + * version * 10 + the minor binary version, so a binary version 1.3 function + * would return the value 13. Note that this will return a value of 10 for + * legacy cubins that do not have a properly-encoded binary architecture + * version. + * - ::CU_FUNC_CACHE_MODE_CA: The attribute to indicate whether the function has + * been compiled with user specified option "-Xptxas --dlcm=ca" set . + * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: The maximum size in bytes of + * dynamically-allocated shared memory. + * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: Preferred shared memory-L1 + * cache split ratio in percent of total shared memory. + * - ::CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET: If this attribute is set, the + * kernel must launch with a valid cluster size specified. + * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH: The required cluster width in + * blocks. + * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT: The required cluster height in + * blocks. + * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH: The required cluster depth in + * blocks. + * - ::CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED: Indicates whether + * the function can be launched with non-portable cluster size. 1 is allowed, + * 0 is disallowed. A non-portable cluster size may only function on the + * specific SKUs the program is tested on. The launch might fail if the + * program is run on a different hardware platform. CUDA API provides + * cudaOccupancyMaxActiveClusters to assist with checking whether the desired + * size can be launched on the current device. A portable cluster size is + * guaranteed to be functional on all compute capabilities higher than the + * target compute capability. The portable cluster size for sm_90 is 8 blocks + * per cluster. This value may increase for future compute capabilities. The + * specific hardware unit may support higher cluster sizes that’s not + * guaranteed to be portable. + * - ::CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE: The block + * scheduling policy of a function. The value type is CUclusterSchedulingPolicy. + * + * \param pi - Returned attribute value + * \param attrib - Attribute requested + * \param hfunc - Function to query attribute of + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxGetCacheConfig, + * ::cuCtxSetCacheConfig, + * ::cuFuncSetCacheConfig, + * ::cuLaunchKernel, + * ::cudaFuncGetAttributes, + * ::cudaFuncSetAttribute, + * ::cuKernelGetAttribute + */ +CUresult CUDAAPI cuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc); + +/** + * \brief Sets information about a function + * + * This call sets the value of a specified attribute \p attrib on the kernel given + * by \p hfunc to an integer value specified by \p val + * This function returns CUDA_SUCCESS if the new value of the attribute could be + * successfully set. If the set fails, this call will return an error. + * Not all attributes can have values set. Attempting to set a value on a read-only + * attribute will result in an error (CUDA_ERROR_INVALID_VALUE) + * + * Supported attributes for the cuFuncSetAttribute call are: + * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: This maximum size in bytes of + * dynamically-allocated shared memory. The value should contain the requested + * maximum size of dynamically-allocated shared memory. The sum of this value and + * the function attribute ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES cannot exceed the + * device attribute ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN. + * The maximal size of requestable dynamic shared memory may differ by GPU + * architecture. + * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: On devices where the L1 + * cache and shared memory use the same hardware resources, this sets the shared memory + * carveout preference, in percent of the total shared memory. + * See ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR + * This is only a hint, and the driver can choose a different ratio if required to execute the function. + * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH: The required cluster width in + * blocks. The width, height, and depth values must either all be 0 or all be + * positive. The validity of the cluster dimensions is checked at launch time. + * If the value is set during compile time, it cannot be set at runtime. + * Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED. + * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT: The required cluster height in + * blocks. The width, height, and depth values must either all be 0 or all be + * positive. The validity of the cluster dimensions is checked at launch time. + * If the value is set during compile time, it cannot be set at runtime. + * Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED. + * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH: The required cluster depth in + * blocks. The width, height, and depth values must either all be 0 or all be + * positive. The validity of the cluster dimensions is checked at launch time. + * If the value is set during compile time, it cannot be set at runtime. + * Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED. + * - ::CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE: The block + * scheduling policy of a function. The value type is CUclusterSchedulingPolicy. + * + * \param hfunc - Function to query attribute of + * \param attrib - Attribute requested + * \param value - The value to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxGetCacheConfig, + * ::cuCtxSetCacheConfig, + * ::cuFuncSetCacheConfig, + * ::cuLaunchKernel, + * ::cudaFuncGetAttributes, + * ::cudaFuncSetAttribute, + * ::cuKernelSetAttribute + */ +CUresult CUDAAPI cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value); + +/** + * \brief Sets the preferred cache configuration for a device function + * + * On devices where the L1 cache and shared memory use the same hardware + * resources, this sets through \p config the preferred cache configuration for + * the device function \p hfunc. This is only a preference. The driver will use + * the requested configuration if possible, but it is free to choose a different + * configuration if required to execute \p hfunc. Any context-wide preference + * set via ::cuCtxSetCacheConfig() will be overridden by this per-function + * setting unless the per-function setting is ::CU_FUNC_CACHE_PREFER_NONE. In + * that case, the current context-wide setting will be used. + * + * This setting does nothing on devices where the size of the L1 cache and + * shared memory are fixed. + * + * Launching a kernel with a different preference than the most recent + * preference setting may insert a device-side synchronization point. + * + * + * The supported cache configurations are: + * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default) + * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache + * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory + * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory + * + * \param hfunc - Kernel to configure cache for + * \param config - Requested cache configuration + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa ::cuCtxGetCacheConfig, + * ::cuCtxSetCacheConfig, + * ::cuFuncGetAttribute, + * ::cuLaunchKernel, + * ::cudaFuncSetCacheConfig, + * ::cuKernelSetCacheConfig + */ +CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config); + +/** + * \brief Sets the shared memory configuration for a device function. + * + * On devices with configurable shared memory banks, this function will + * force all subsequent launches of the specified device function to have + * the given shared memory bank size configuration. On any given launch of the + * function, the shared memory configuration of the device will be temporarily + * changed if needed to suit the function's preferred configuration. Changes in + * shared memory configuration between subsequent launches of functions, + * may introduce a device side synchronization point. + * + * Any per-function setting of shared memory bank size set via + * ::cuFuncSetSharedMemConfig will override the context wide setting set with + * ::cuCtxSetSharedMemConfig. + * + * Changing the shared memory bank size will not increase shared memory usage + * or affect occupancy of kernels, but may have major effects on performance. + * Larger bank sizes will allow for greater potential bandwidth to shared memory, + * but will change what kinds of accesses to shared memory will result in bank + * conflicts. + * + * This function will do nothing on devices with fixed shared memory bank size. + * + * The supported bank configurations are: + * - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: use the context's shared memory + * configuration when launching this function. + * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to + * be natively four bytes when launching this function. + * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: set shared memory bank width to + * be natively eight bytes when launching this function. + * + * \param hfunc - kernel to be given a shared memory config + * \param config - requested shared memory configuration + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa ::cuCtxGetCacheConfig, + * ::cuCtxSetCacheConfig, + * ::cuCtxGetSharedMemConfig, + * ::cuCtxSetSharedMemConfig, + * ::cuFuncGetAttribute, + * ::cuLaunchKernel, + * ::cudaFuncSetSharedMemConfig + */ +CUresult CUDAAPI cuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig config); + +/** + * \brief Returns a module handle + * + * Returns in \p *hmod the handle of the module that function \p hfunc + * is located in. The lifetime of the module corresponds to the lifetime of + * the context it was loaded in or until the module is explicitly unloaded. + * + * The CUDA runtime manages its own modules loaded into the primary context. + * If the handle returned by this API refers to a module loaded by the CUDA runtime, + * calling ::cuModuleUnload() on that module will result in undefined behavior. + * + * \param hmod - Returned module handle + * \param hfunc - Function to retrieve module for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_FOUND + * \notefnerr + * + */ +CUresult CUDAAPI cuFuncGetModule(CUmodule *hmod, CUfunction hfunc); + +/** + * \brief Launches a CUDA function ::CUfunction or a CUDA kernel ::CUkernel + * + * Invokes the function ::CUfunction or the kernel ::CUkernel \p f + * on a \p gridDimX x \p gridDimY x \p gridDimZ grid of blocks. + * Each block contains \p blockDimX x \p blockDimY x + * \p blockDimZ threads. + * + * \p sharedMemBytes sets the amount of dynamic shared memory that will be + * available to each thread block. + * + * Kernel parameters to \p f can be specified in one of two ways: + * + * 1) Kernel parameters can be specified via \p kernelParams. If \p f + * has N parameters, then \p kernelParams needs to be an array of N + * pointers. Each of \p kernelParams[0] through \p kernelParams[N-1] + * must point to a region of memory from which the actual kernel + * parameter will be copied. The number of kernel parameters and their + * offsets and sizes do not need to be specified as that information is + * retrieved directly from the kernel's image. + * + * 2) Kernel parameters can also be packaged by the application into + * a single buffer that is passed in via the \p extra parameter. + * This places the burden on the application of knowing each kernel + * parameter's size and alignment/padding within the buffer. Here is + * an example of using the \p extra parameter in this manner: + * \code + size_t argBufferSize; + char argBuffer[256]; + + // populate argBuffer and argBufferSize + + void *config[] = { + CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, + CU_LAUNCH_PARAM_BUFFER_SIZE, &argBufferSize, + CU_LAUNCH_PARAM_END + }; + status = cuLaunchKernel(f, gx, gy, gz, bx, by, bz, sh, s, NULL, config); + * \endcode + * + * The \p extra parameter exists to allow ::cuLaunchKernel to take + * additional less commonly used arguments. \p extra specifies a list of + * names of extra settings and their corresponding values. Each extra + * setting name is immediately followed by the corresponding value. The + * list must be terminated with either NULL or ::CU_LAUNCH_PARAM_END. + * + * - ::CU_LAUNCH_PARAM_END, which indicates the end of the \p extra + * array; + * - ::CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next + * value in \p extra will be a pointer to a buffer containing all + * the kernel parameters for launching kernel \p f; + * - ::CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next + * value in \p extra will be a pointer to a size_t containing the + * size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER; + * + * The error ::CUDA_ERROR_INVALID_VALUE will be returned if kernel + * parameters are specified with both \p kernelParams and \p extra + * (i.e. both \p kernelParams and \p extra are non-NULL). + * + * Calling ::cuLaunchKernel() invalidates the persistent function state + * set through the following deprecated APIs: + * ::cuFuncSetBlockShape(), + * ::cuFuncSetSharedSize(), + * ::cuParamSetSize(), + * ::cuParamSeti(), + * ::cuParamSetf(), + * ::cuParamSetv(). + * + * Note that to use ::cuLaunchKernel(), the kernel \p f must either have + * been compiled with toolchain version 3.2 or later so that it will + * contain kernel parameter information, or have no kernel parameters. + * If either of these conditions is not met, then ::cuLaunchKernel() will + * return ::CUDA_ERROR_INVALID_IMAGE. + * + * Note that the API can also be used to launch context-less kernel ::CUkernel + * by querying the handle using ::cuLibraryGetKernel() and then passing it + * to the API by casting to ::CUfunction. Here, the context to launch + * the kernel on will either be taken from the specified stream \p hStream + * or the current context in case of NULL stream. + * + * \param f - Function ::CUfunction or Kernel ::CUkernel to launch + * \param gridDimX - Width of grid in blocks + * \param gridDimY - Height of grid in blocks + * \param gridDimZ - Depth of grid in blocks + * \param blockDimX - X dimension of each thread block + * \param blockDimY - Y dimension of each thread block + * \param blockDimZ - Z dimension of each thread block + * \param sharedMemBytes - Dynamic shared-memory size per thread block in bytes + * \param hStream - Stream identifier + * \param kernelParams - Array of pointers to kernel parameters + * \param extra - Extra options + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_IMAGE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_LAUNCH_FAILED, + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, + * ::CUDA_ERROR_LAUNCH_TIMEOUT, + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, + * ::CUDA_ERROR_NOT_FOUND + * \note_null_stream + * \notefnerr + * + * \sa ::cuCtxGetCacheConfig, + * ::cuCtxSetCacheConfig, + * ::cuFuncSetCacheConfig, + * ::cuFuncGetAttribute, + * ::cudaLaunchKernel, + * ::cuLibraryGetKernel, + * ::cuKernelSetCacheConfig, + * ::cuKernelGetAttribute, + * ::cuKernelSetAttribute + */ +CUresult CUDAAPI cuLaunchKernel(CUfunction f, + unsigned int gridDimX, + unsigned int gridDimY, + unsigned int gridDimZ, + unsigned int blockDimX, + unsigned int blockDimY, + unsigned int blockDimZ, + unsigned int sharedMemBytes, + CUstream hStream, + void **kernelParams, + void **extra); + +/** + * \brief Launches a CUDA function ::CUfunction or a CUDA kernel ::CUkernel with launch-time configuration + * + * Invokes the function ::CUfunction or the kernel ::CUkernel \p f with the specified launch-time configuration + * \p config. + * + * The ::CUlaunchConfig structure is defined as: + * \code + typedef struct CUlaunchConfig_st { + unsigned int gridDimX; + unsigned int gridDimY; + unsigned int gridDimZ; + unsigned int blockDimX; + unsigned int blockDimY; + unsigned int blockDimZ; + unsigned int sharedMemBytes; + CUstream hStream; + CUlaunchAttribute *attrs; + unsigned int numAttrs; + } CUlaunchConfig; + * \endcode + * where: + * - ::CUlaunchConfig::gridDimX is the width of the grid in blocks. + * - ::CUlaunchConfig::gridDimY is the height of the grid in blocks. + * - ::CUlaunchConfig::gridDimZ is the depth of the grid in blocks. + * - ::CUlaunchConfig::blockDimX is the X dimension of each thread block. + * - ::CUlaunchConfig::blockDimX is the Y dimension of each thread block. + * - ::CUlaunchConfig::blockDimZ is the Z dimension of each thread block. + * - ::CUlaunchConfig::sharedMemBytes is the dynamic shared-memory size per + * thread block in bytes. + * - ::CUlaunchConfig::hStream is the handle to the stream to perform the launch + * in. The CUDA context associated with this stream must match that associated + * with function f. + * - ::CUlaunchConfig::attrs is an array of ::CUlaunchConfig::numAttrs + * continguous ::CUlaunchAttribute elements. The value of this pointer is not + * considered if ::CUlaunchConfig::numAttrs is zero. However, in that case, it + * is recommended to set the pointer to NULL. + * - ::CUlaunchConfig::numAttrs is the numbers of attributes populating the + * first ::CUlaunchConfig::numAttrs positions of the ::CUlaunchConfig::attrs + * array. + * + * Launch-time configuration is specified by adding entries to + * ::CUlaunchConfig::attrs. Each entry is an attribute ID and a corresponding + * attribute value. + * + * The ::CUlaunchAttribute structure is defined as: + * \code + typedef struct CUlaunchAttribute_st { + CUlaunchAttributeID id; + CUlaunchAttributeValue value; + } CUlaunchAttribute; + * \endcode + * where: + * - ::CUlaunchAttribute::id is a unique enum identifying the attribute. + * - ::CUlaunchAttribute::value is a union that hold the attribute value. + * + * An example of using the \p config parameter: + * \code + CUlaunchAttribute coopAttr = {.id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE, + .value = 1}; + CUlaunchConfig config = {... // set block and grid dimensions + .attrs = &coopAttr, + .numAttrs = 1}; + + cuLaunchKernelEx(&config, kernel, NULL, NULL); + * \endcode + * + * The ::CUlaunchAttributeID enum is defined as: + * \code + typedef enum CUlaunchAttributeID_enum { + CU_LAUNCH_ATTRIBUTE_IGNORE = 0, + CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW = 1, + CU_LAUNCH_ATTRIBUTE_COOPERATIVE = 2, + CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY = 3, + CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION = 4, + CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE = 5, + CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION = 6, + CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT = 7, + } CUlaunchAttributeID; + * \endcode + * + * and the corresponding ::CUlaunchAttributeValue union as : + * \code + typedef union CUlaunchAttributeValue_union { + cuuint64_t pad[8]; + CUaccessPolicyWindow accessPolicyWindow; + int cooperative; + CUsynchronizationPolicy syncPolicy; + struct { + unsigned int x; + unsigned int y; + unsigned int z; + } clusterDim; + CUclusterSchedulingPolicy clusterSchedulingPolicyPreference; + int programmaticStreamSerializationAllowed; + struct { + CUevent event; + int flags; + int triggerAtBlockStart; + } programmaticEvent; + } CUlaunchAttributeValue; + * \endcode + * + * Setting ::CU_LAUNCH_ATTRIBUTE_COOPERATIVE to a non-zero value causes the + * kernel launch to be a cooperative launch, with exactly the same usage and + * semantics of ::cuLaunchCooperativeKernel. + * + * Setting ::CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION to a non-zero + * values causes the kernel to use programmatic means to resolve its stream + * dependency -- enabling the CUDA runtime to opportunistically allow the grid's + * execution to overlap with the previous kernel in the stream, if that kernel + * requests the overlap. + * + * ::CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT records an event along with the + * kernel launch. Event recorded through this launch attribute is guaranteed to + * only trigger after all block in the associated kernel trigger the event. A + * block can trigger the event through PTX launchdep.release or CUDA builtin + * function cudaTriggerProgrammaticLaunchCompletion(). A trigger can also be + * inserted at the beginning of each block's execution if triggerAtBlockStart is + * set to non-0. Note that dependents (including the CPU thread calling + * cuEventSynchronize()) are not guaranteed to observe the release precisely + * when it is released. For example, cuEventSynchronize() may only observe the + * event trigger long after the associated kernel has completed. This recording + * type is primarily meant for establishing programmatic dependency between + * device tasks. The event supplied must not be an interprocess or interop + * event. The event must disable timing (i.e. created with + * ::CU_EVENT_DISABLE_TIMING flag set). + * + * The effect of other attributes is consistent with their effect when set via + * persistent APIs. + * + * See ::cuStreamSetAttribute for + * - ::CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW + * - ::CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY + * + * See ::cuFunctionSetAttribute for + * - ::CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION + * - ::CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE + * + * Kernel parameters to \p f can be specified in the same ways that they can be + * using ::cuLaunchKernel. + * + * Note that the API can also be used to launch context-less kernel ::CUkernel + * by querying the handle using ::cuLibraryGetKernel() and then passing it + * to the API by casting to ::CUfunction. Here, the context to launch + * the kernel on will either be taken from the specified stream ::CUlaunchConfig::hStream + * or the current context in case of NULL stream. + * + * \param config - Config to launch + * \param f - Function ::CUfunction or Kernel ::CUkernel to launch + * \param kernelParams - Array of pointers to kernel parameters + * \param extra - Extra options + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_IMAGE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_LAUNCH_FAILED, + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, + * ::CUDA_ERROR_LAUNCH_TIMEOUT, + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, + * ::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, + * ::CUDA_ERROR_NOT_FOUND + * \note_null_stream + * \notefnerr + * + * \sa ::cuCtxGetCacheConfig, + * ::cuCtxSetCacheConfig, + * ::cuFuncSetCacheConfig, + * ::cuFuncGetAttribute, + * ::cudaLaunchKernel, + * ::cudaLaunchKernelEx, + * ::cuLibraryGetKernel, + * ::cuKernelSetCacheConfig, + * ::cuKernelGetAttribute, + * ::cuKernelSetAttribute + */ +CUresult CUDAAPI cuLaunchKernelEx(const CUlaunchConfig *config, + CUfunction f, + void **kernelParams, + void **extra); + +/** + * \brief Launches a CUDA function ::CUfunction or a CUDA kernel ::CUkernel where thread blocks + * can cooperate and synchronize as they execute + * + * Invokes the function ::CUfunction or the kernel ::CUkernel \p f on a \p gridDimX x \p gridDimY x \p gridDimZ + * grid of blocks. Each block contains \p blockDimX x \p blockDimY x + * \p blockDimZ threads. + * + * Note that the API can also be used to launch context-less kernel ::CUkernel + * by querying the handle using ::cuLibraryGetKernel() and then passing it + * to the API by casting to ::CUfunction. Here, the context to launch + * the kernel on will either be taken from the specified stream \p hStream + * or the current context in case of NULL stream. + * + * \p sharedMemBytes sets the amount of dynamic shared memory that will be + * available to each thread block. + * + * The device on which this kernel is invoked must have a non-zero value for + * the device attribute ::CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH. + * + * The total number of blocks launched cannot exceed the maximum number of blocks per + * multiprocessor as returned by ::cuOccupancyMaxActiveBlocksPerMultiprocessor (or + * ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors + * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. + * + * The kernel cannot make use of CUDA dynamic parallelism. + * + * Kernel parameters must be specified via \p kernelParams. If \p f + * has N parameters, then \p kernelParams needs to be an array of N + * pointers. Each of \p kernelParams[0] through \p kernelParams[N-1] + * must point to a region of memory from which the actual kernel + * parameter will be copied. The number of kernel parameters and their + * offsets and sizes do not need to be specified as that information is + * retrieved directly from the kernel's image. + * + * Calling ::cuLaunchCooperativeKernel() sets persistent function state that is + * the same as function state set through ::cuLaunchKernel API + * + * When the kernel \p f is launched via ::cuLaunchCooperativeKernel(), the previous + * block shape, shared size and parameter info associated with \p f + * is overwritten. + * + * Note that to use ::cuLaunchCooperativeKernel(), the kernel \p f must either have + * been compiled with toolchain version 3.2 or later so that it will + * contain kernel parameter information, or have no kernel parameters. + * If either of these conditions is not met, then ::cuLaunchCooperativeKernel() will + * return ::CUDA_ERROR_INVALID_IMAGE. + * + * Note that the API can also be used to launch context-less kernel ::CUkernel + * by querying the handle using ::cuLibraryGetKernel() and then passing it + * to the API by casting to ::CUfunction. Here, the context to launch + * the kernel on will either be taken from the specified stream \p hStream + * or the current context in case of NULL stream. + * + * \param f - Function ::CUfunction or Kernel ::CUkernel to launch + * \param gridDimX - Width of grid in blocks + * \param gridDimY - Height of grid in blocks + * \param gridDimZ - Depth of grid in blocks + * \param blockDimX - X dimension of each thread block + * \param blockDimY - Y dimension of each thread block + * \param blockDimZ - Z dimension of each thread block + * \param sharedMemBytes - Dynamic shared-memory size per thread block in bytes + * \param hStream - Stream identifier + * \param kernelParams - Array of pointers to kernel parameters + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_IMAGE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_LAUNCH_FAILED, + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, + * ::CUDA_ERROR_LAUNCH_TIMEOUT, + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, + * ::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, + * ::CUDA_ERROR_NOT_FOUND + * \note_null_stream + * \notefnerr + * + * \sa ::cuCtxGetCacheConfig, + * ::cuCtxSetCacheConfig, + * ::cuFuncSetCacheConfig, + * ::cuFuncGetAttribute, + * ::cuLaunchCooperativeKernelMultiDevice, + * ::cudaLaunchCooperativeKernel, + * ::cuLibraryGetKernel, + * ::cuKernelSetCacheConfig, + * ::cuKernelGetAttribute, + * ::cuKernelSetAttribute + */ +CUresult CUDAAPI cuLaunchCooperativeKernel(CUfunction f, + unsigned int gridDimX, + unsigned int gridDimY, + unsigned int gridDimZ, + unsigned int blockDimX, + unsigned int blockDimY, + unsigned int blockDimZ, + unsigned int sharedMemBytes, + CUstream hStream, + void **kernelParams); + +/** + * \brief Launches CUDA functions on multiple devices where thread blocks can cooperate and synchronize as they execute + * + * \deprecated This function is deprecated as of CUDA 11.3. + * + * Invokes kernels as specified in the \p launchParamsList array where each element + * of the array specifies all the parameters required to perform a single kernel launch. + * These kernels can cooperate and synchronize as they execute. The size of the array is + * specified by \p numDevices. + * + * No two kernels can be launched on the same device. All the devices targeted by this + * multi-device launch must be identical. All devices must have a non-zero value for the + * device attribute ::CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH. + * + * All kernels launched must be identical with respect to the compiled code. Note that + * any __device__, __constant__ or __managed__ variables present in the module that owns + * the kernel launched on each device, are independently instantiated on every device. + * It is the application's responsibility to ensure these variables are initialized and + * used appropriately. + * + * The size of the grids as specified in blocks, the size of the blocks themselves + * and the amount of shared memory used by each thread block must also match across + * all launched kernels. + * + * The streams used to launch these kernels must have been created via either ::cuStreamCreate + * or ::cuStreamCreateWithPriority. The NULL stream or ::CU_STREAM_LEGACY or ::CU_STREAM_PER_THREAD + * cannot be used. + * + * The total number of blocks launched per kernel cannot exceed the maximum number of blocks + * per multiprocessor as returned by ::cuOccupancyMaxActiveBlocksPerMultiprocessor (or + * ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors + * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. Since the + * total number of blocks launched per device has to match across all devices, the maximum + * number of blocks that can be launched per device will be limited by the device with the + * least number of multiprocessors. + * + * The kernels cannot make use of CUDA dynamic parallelism. + * + * The ::CUDA_LAUNCH_PARAMS structure is defined as: + * \code + typedef struct CUDA_LAUNCH_PARAMS_st + { + CUfunction function; + unsigned int gridDimX; + unsigned int gridDimY; + unsigned int gridDimZ; + unsigned int blockDimX; + unsigned int blockDimY; + unsigned int blockDimZ; + unsigned int sharedMemBytes; + CUstream hStream; + void **kernelParams; + } CUDA_LAUNCH_PARAMS; + * \endcode + * where: + * - ::CUDA_LAUNCH_PARAMS::function specifies the kernel to be launched. All functions must + * be identical with respect to the compiled code. + * Note that you can also specify context-less kernel ::CUkernel by querying the handle + * using ::cuLibraryGetKernel() and then casting to ::CUfunction. In this case, the context to + * launch the kernel on be taken from the specified stream ::CUDA_LAUNCH_PARAMS::hStream. + * - ::CUDA_LAUNCH_PARAMS::gridDimX is the width of the grid in blocks. This must match across + * all kernels launched. + * - ::CUDA_LAUNCH_PARAMS::gridDimY is the height of the grid in blocks. This must match across + * all kernels launched. + * - ::CUDA_LAUNCH_PARAMS::gridDimZ is the depth of the grid in blocks. This must match across + * all kernels launched. + * - ::CUDA_LAUNCH_PARAMS::blockDimX is the X dimension of each thread block. This must match across + * all kernels launched. + * - ::CUDA_LAUNCH_PARAMS::blockDimX is the Y dimension of each thread block. This must match across + * all kernels launched. + * - ::CUDA_LAUNCH_PARAMS::blockDimZ is the Z dimension of each thread block. This must match across + * all kernels launched. + * - ::CUDA_LAUNCH_PARAMS::sharedMemBytes is the dynamic shared-memory size per thread block in bytes. + * This must match across all kernels launched. + * - ::CUDA_LAUNCH_PARAMS::hStream is the handle to the stream to perform the launch in. This cannot + * be the NULL stream or ::CU_STREAM_LEGACY or ::CU_STREAM_PER_THREAD. The CUDA context associated + * with this stream must match that associated with ::CUDA_LAUNCH_PARAMS::function. + * - ::CUDA_LAUNCH_PARAMS::kernelParams is an array of pointers to kernel parameters. If + * ::CUDA_LAUNCH_PARAMS::function has N parameters, then ::CUDA_LAUNCH_PARAMS::kernelParams + * needs to be an array of N pointers. Each of ::CUDA_LAUNCH_PARAMS::kernelParams[0] through + * ::CUDA_LAUNCH_PARAMS::kernelParams[N-1] must point to a region of memory from which the actual + * kernel parameter will be copied. The number of kernel parameters and their offsets and sizes + * do not need to be specified as that information is retrieved directly from the kernel's image. + * + * By default, the kernel won't begin execution on any GPU until all prior work in all the specified + * streams has completed. This behavior can be overridden by specifying the flag + * ::CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC. When this flag is specified, each kernel + * will only wait for prior work in the stream corresponding to that GPU to complete before it begins + * execution. + * + * Similarly, by default, any subsequent work pushed in any of the specified streams will not begin + * execution until the kernels on all GPUs have completed. This behavior can be overridden by specifying + * the flag ::CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC. When this flag is specified, + * any subsequent work pushed in any of the specified streams will only wait for the kernel launched + * on the GPU corresponding to that stream to complete before it begins execution. + * + * Calling ::cuLaunchCooperativeKernelMultiDevice() sets persistent function state that is + * the same as function state set through ::cuLaunchKernel API when called individually for each + * element in \p launchParamsList. + * + * When kernels are launched via ::cuLaunchCooperativeKernelMultiDevice(), the previous + * block shape, shared size and parameter info associated with each ::CUDA_LAUNCH_PARAMS::function + * in \p launchParamsList is overwritten. + * + * Note that to use ::cuLaunchCooperativeKernelMultiDevice(), the kernels must either have + * been compiled with toolchain version 3.2 or later so that it will + * contain kernel parameter information, or have no kernel parameters. + * If either of these conditions is not met, then ::cuLaunchCooperativeKernelMultiDevice() will + * return ::CUDA_ERROR_INVALID_IMAGE. + * + * \param launchParamsList - List of launch parameters, one per device + * \param numDevices - Size of the \p launchParamsList array + * \param flags - Flags to control launch behavior + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_IMAGE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_LAUNCH_FAILED, + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, + * ::CUDA_ERROR_LAUNCH_TIMEOUT, + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, + * ::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED + * \note_null_stream + * \notefnerr + * + * \sa ::cuCtxGetCacheConfig, + * ::cuCtxSetCacheConfig, + * ::cuFuncSetCacheConfig, + * ::cuFuncGetAttribute, + * ::cuLaunchCooperativeKernel, + * ::cudaLaunchCooperativeKernelMultiDevice + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchCooperativeKernelMultiDevice(CUDA_LAUNCH_PARAMS *launchParamsList, unsigned int numDevices, unsigned int flags); + +/** + * \brief Enqueues a host function call in a stream + * + * Enqueues a host function to run in a stream. The function will be called + * after currently enqueued work and will block work added after it. + * + * The host function must not make any CUDA API calls. Attempting to use a + * CUDA API may result in ::CUDA_ERROR_NOT_PERMITTED, but this is not required. + * The host function must not perform any synchronization that may depend on + * outstanding CUDA work not mandated to run earlier. Host functions without a + * mandated order (such as in independent streams) execute in undefined order + * and may be serialized. + * + * For the purposes of Unified Memory, execution makes a number of guarantees: + *
    + *
  • The stream is considered idle for the duration of the function's + * execution. Thus, for example, the function may always use memory attached + * to the stream it was enqueued in.
  • + *
  • The start of execution of the function has the same effect as + * synchronizing an event recorded in the same stream immediately prior to + * the function. It thus synchronizes streams which have been "joined" + * prior to the function.
  • + *
  • Adding device work to any stream does not have the effect of making + * the stream active until all preceding host functions and stream callbacks + * have executed. Thus, for + * example, a function might use global attached memory even if work has + * been added to another stream, if the work has been ordered behind the + * function call with an event.
  • + *
  • Completion of the function does not cause a stream to become + * active except as described above. The stream will remain idle + * if no device work follows the function, and will remain idle across + * consecutive host functions or stream callbacks without device work in + * between. Thus, for example, + * stream synchronization can be done by signaling from a host function at the + * end of the stream.
  • + *
+ * + * Note that, in contrast to ::cuStreamAddCallback, the function will not be + * called in the event of an error in the CUDA context. + * + * \param hStream - Stream to enqueue function call in + * \param fn - The function to call once preceding stream operations are complete + * \param userData - User-specified data to be passed to the function + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \note_null_stream + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamWaitEvent, + * ::cuStreamDestroy, + * ::cuMemAllocManaged, + * ::cuStreamAttachMemAsync, + * ::cuStreamAddCallback + */ +CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void *userData); + +/** @} */ /* END CUDA_EXEC */ + +/** + * \defgroup CUDA_EXEC_DEPRECATED Execution Control [DEPRECATED] + * + * ___MANBRIEF___ deprecated execution control functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the deprecated execution control functions of the + * low-level CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Sets the block-dimensions for the function + * + * \deprecated + * + * Specifies the \p x, \p y, and \p z dimensions of the thread blocks that are + * created when the kernel given by \p hfunc is launched. + * + * \param hfunc - Kernel to specify dimensions of + * \param x - X dimension + * \param y - Y dimension + * \param z - Z dimension + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuFuncSetSharedSize, + * ::cuFuncSetCacheConfig, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSeti, + * ::cuParamSetf, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z); + +/** + * \brief Sets the dynamic shared-memory size for the function + * + * \deprecated + * + * Sets through \p bytes the amount of dynamic shared memory that will be + * available to each thread block when the kernel given by \p hfunc is launched. + * + * \param hfunc - Kernel to specify dynamic shared-memory size for + * \param bytes - Dynamic shared-memory size per thread in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetCacheConfig, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSeti, + * ::cuParamSetf, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes); + +/** + * \brief Sets the parameter size for the function + * + * \deprecated + * + * Sets through \p numbytes the total size in bytes needed by the function + * parameters of the kernel corresponding to \p hfunc. + * + * \param hfunc - Kernel to set parameter size for + * \param numbytes - Size of parameter list in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetf, + * ::cuParamSeti, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetSize(CUfunction hfunc, unsigned int numbytes); + +/** + * \brief Adds an integer parameter to the function's argument list + * + * \deprecated + * + * Sets an integer parameter that will be specified the next time the + * kernel corresponding to \p hfunc will be invoked. \p offset is a byte offset. + * + * \param hfunc - Kernel to add parameter to + * \param offset - Offset to add parameter to argument list + * \param value - Value of parameter + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSetf, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuParamSeti(CUfunction hfunc, int offset, unsigned int value); + +/** + * \brief Adds a floating-point parameter to the function's argument list + * + * \deprecated + * + * Sets a floating-point parameter that will be specified the next time the + * kernel corresponding to \p hfunc will be invoked. \p offset is a byte offset. + * + * \param hfunc - Kernel to add parameter to + * \param offset - Offset to add parameter to argument list + * \param value - Value of parameter + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSeti, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetf(CUfunction hfunc, int offset, float value); + +/** + * \brief Adds arbitrary data to the function's argument list + * + * \deprecated + * + * Copies an arbitrary amount of data (specified in \p numbytes) from \p ptr + * into the parameter space of the kernel corresponding to \p hfunc. \p offset + * is a byte offset. + * + * \param hfunc - Kernel to add data to + * \param offset - Offset to add data to argument list + * \param ptr - Pointer to arbitrary data + * \param numbytes - Size of data to copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSetf, + * ::cuParamSeti, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetv(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes); + +/** + * \brief Launches a CUDA function + * + * \deprecated + * + * Invokes the kernel \p f on a 1 x 1 x 1 grid of blocks. The block + * contains the number of threads specified by a previous call to + * ::cuFuncSetBlockShape(). + * + * The block shape, dynamic shared memory size, and parameter information + * must be set using + * ::cuFuncSetBlockShape(), + * ::cuFuncSetSharedSize(), + * ::cuParamSetSize(), + * ::cuParamSeti(), + * ::cuParamSetf(), and + * ::cuParamSetv() + * prior to calling this function. + * + * Launching a function via ::cuLaunchKernel() invalidates the function's + * block shape, dynamic shared memory size, and parameter information. After + * launching via cuLaunchKernel, this state must be re-initialized prior to + * calling this function. Failure to do so results in undefined behavior. + * + * \param f - Kernel to launch + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_LAUNCH_FAILED, + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, + * ::CUDA_ERROR_LAUNCH_TIMEOUT, + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSetf, + * ::cuParamSeti, + * ::cuParamSetv, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuLaunch(CUfunction f); + +/** + * \brief Launches a CUDA function + * + * \deprecated + * + * Invokes the kernel \p f on a \p grid_width x \p grid_height grid of + * blocks. Each block contains the number of threads specified by a previous + * call to ::cuFuncSetBlockShape(). + * + * The block shape, dynamic shared memory size, and parameter information + * must be set using + * ::cuFuncSetBlockShape(), + * ::cuFuncSetSharedSize(), + * ::cuParamSetSize(), + * ::cuParamSeti(), + * ::cuParamSetf(), and + * ::cuParamSetv() + * prior to calling this function. + * + * Launching a function via ::cuLaunchKernel() invalidates the function's + * block shape, dynamic shared memory size, and parameter information. After + * launching via cuLaunchKernel, this state must be re-initialized prior to + * calling this function. Failure to do so results in undefined behavior. + * + * \param f - Kernel to launch + * \param grid_width - Width of grid in blocks + * \param grid_height - Height of grid in blocks + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_LAUNCH_FAILED, + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, + * ::CUDA_ERROR_LAUNCH_TIMEOUT, + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSetf, + * ::cuParamSeti, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGrid(CUfunction f, int grid_width, int grid_height); + +/** + * \brief Launches a CUDA function + * + * \deprecated + * + * Invokes the kernel \p f on a \p grid_width x \p grid_height grid of + * blocks. Each block contains the number of threads specified by a previous + * call to ::cuFuncSetBlockShape(). + * + * The block shape, dynamic shared memory size, and parameter information + * must be set using + * ::cuFuncSetBlockShape(), + * ::cuFuncSetSharedSize(), + * ::cuParamSetSize(), + * ::cuParamSeti(), + * ::cuParamSetf(), and + * ::cuParamSetv() + * prior to calling this function. + * + * Launching a function via ::cuLaunchKernel() invalidates the function's + * block shape, dynamic shared memory size, and parameter information. After + * launching via cuLaunchKernel, this state must be re-initialized prior to + * calling this function. Failure to do so results in undefined behavior. + * + * \param f - Kernel to launch + * \param grid_width - Width of grid in blocks + * \param grid_height - Height of grid in blocks + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_LAUNCH_FAILED, + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, + * ::CUDA_ERROR_LAUNCH_TIMEOUT, + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED + * + * \note In certain cases where cubins are created with no ABI (i.e., using \p ptxas \p --abi-compile \p no), + * this function may serialize kernel launches. The CUDA driver retains asynchronous behavior by + * growing the per-thread stack as needed per launch and not shrinking it afterwards. + * + * \note_null_stream + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSetf, + * ::cuParamSeti, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchKernel + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream); + + +/** + * \brief Adds a texture-reference to the function's argument list + * + * \deprecated + * + * Makes the CUDA array or linear memory bound to the texture reference + * \p hTexRef available to a device program as a texture. In this version of + * CUDA, the texture-reference must be obtained via ::cuModuleGetTexRef() and + * the \p texunit parameter must be set to ::CU_PARAM_TR_DEFAULT. + * + * \param hfunc - Kernel to add texture-reference to + * \param texunit - Texture unit (must be ::CU_PARAM_TR_DEFAULT) + * \param hTexRef - Texture-reference to add to argument list + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef); +/** @} */ /* END CUDA_EXEC_DEPRECATED */ + +/** + * \defgroup CUDA_GRAPH Graph Management + * + * ___MANBRIEF___ graph management functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the graph management functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Creates a graph + * + * Creates an empty graph, which is returned via \p phGraph. + * + * \param phGraph - Returns newly created graph + * \param flags - Graph creation flags, must be 0 + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddHostNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode, + * ::cuGraphInstantiate, + * ::cuGraphDestroy, + * ::cuGraphGetNodes, + * ::cuGraphGetRootNodes, + * ::cuGraphGetEdges, + * ::cuGraphClone + */ +CUresult CUDAAPI cuGraphCreate(CUgraph *phGraph, unsigned int flags); + +/** + * \brief Creates a kernel execution node and adds it to a graph + * + * Creates a new kernel execution node and adds it to \p hGraph with \p numDependencies + * dependencies specified via \p dependencies and arguments specified in \p nodeParams. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p phGraphNode. + * + * The CUDA_KERNEL_NODE_PARAMS structure is defined as: + * + * \code + * typedef struct CUDA_KERNEL_NODE_PARAMS_st { + * CUfunction func; + * unsigned int gridDimX; + * unsigned int gridDimY; + * unsigned int gridDimZ; + * unsigned int blockDimX; + * unsigned int blockDimY; + * unsigned int blockDimZ; + * unsigned int sharedMemBytes; + * void **kernelParams; + * void **extra; + * } CUDA_KERNEL_NODE_PARAMS; + * \endcode + * + * When the graph is launched, the node will invoke kernel \p func on a (\p gridDimX x + * \p gridDimY x \p gridDimZ) grid of blocks. Each block contains + * (\p blockDimX x \p blockDimY x \p blockDimZ) threads. + * + * \p sharedMemBytes sets the amount of dynamic shared memory that will be + * available to each thread block. + * + * Kernel parameters to \p func can be specified in one of two ways: + * + * 1) Kernel parameters can be specified via \p kernelParams. If the kernel has N + * parameters, then \p kernelParams needs to be an array of N pointers. Each pointer, + * from \p kernelParams[0] to \p kernelParams[N-1], points to the region of memory from which the actual + * parameter will be copied. The number of kernel parameters and their offsets and sizes do not need + * to be specified as that information is retrieved directly from the kernel's image. + * + * 2) Kernel parameters for non-cooperative kernels can also be packaged by the application into a single + * buffer that is passed in via \p extra. This places the burden on the application of knowing each + * kernel parameter's size and alignment/padding within the buffer. The \p extra parameter exists + * to allow this function to take additional less commonly used arguments. \p extra specifies + * a list of names of extra settings and their corresponding values. Each extra setting name is + * immediately followed by the corresponding value. The list must be terminated with either NULL or + * CU_LAUNCH_PARAM_END. + * + * - ::CU_LAUNCH_PARAM_END, which indicates the end of the \p extra + * array; + * - ::CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next + * value in \p extra will be a pointer to a buffer + * containing all the kernel parameters for launching kernel + * \p func; + * - ::CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next + * value in \p extra will be a pointer to a size_t + * containing the size of the buffer specified with + * ::CU_LAUNCH_PARAM_BUFFER_POINTER; + * + * The error ::CUDA_ERROR_INVALID_VALUE will be returned if kernel parameters are specified with both + * \p kernelParams and \p extra (i.e. both \p kernelParams and \p extra are non-NULL). + * ::CUDA_ERROR_INVALID_VALUE will be returned if \p extra is used for a cooperative kernel. + * + * The \p kernelParams or \p extra array, as well as the argument values it points to, + * are copied during this call. + * + * \note Kernels launched using graphs must not use texture and surface references. Reading or + * writing through any texture or surface reference is undefined behavior. + * This restriction does not apply to texture and surface objects. + * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param nodeParams - Parameters for the GPU execution node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuLaunchKernel, + * ::cuLaunchCooperativeKernel, + * ::cuGraphKernelNodeGetParams, + * ::cuGraphKernelNodeSetParams, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddHostNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode + */ +CUresult CUDAAPI cuGraphAddKernelNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS *nodeParams); + +/** + * \brief Returns a kernel node's parameters + * + * Returns the parameters of kernel node \p hNode in \p nodeParams. + * The \p kernelParams or \p extra array returned in \p nodeParams, + * as well as the argument values it points to, are owned by the node. + * This memory remains valid until the node is destroyed or its + * parameters are modified, and should not be modified + * directly. Use ::cuGraphKernelNodeSetParams to update the + * parameters of this node. + * + * The params will contain either \p kernelParams or \p extra, + * according to which of these was most recently set on the node. + * + * \param hNode - Node to get the parameters for + * \param nodeParams - Pointer to return the parameters + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuLaunchKernel, + * ::cuGraphAddKernelNode, + * ::cuGraphKernelNodeSetParams + */ +CUresult CUDAAPI cuGraphKernelNodeGetParams(CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS *nodeParams); + +/** + * \brief Sets a kernel node's parameters + * + * Sets the parameters of kernel node \p hNode to \p nodeParams. + * + * \param hNode - Node to set the parameters for + * \param nodeParams - Parameters to copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuLaunchKernel, + * ::cuGraphAddKernelNode, + * ::cuGraphKernelNodeGetParams + */ +CUresult CUDAAPI cuGraphKernelNodeSetParams(CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams); + +/** + * \brief Creates a memcpy node and adds it to a graph + * + * Creates a new memcpy node and adds it to \p hGraph with \p numDependencies + * dependencies specified via \p dependencies. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p phGraphNode. + * + * When the graph is launched, the node will perform the memcpy described by \p copyParams. + * See ::cuMemcpy3D() for a description of the structure and its restrictions. + * + * Memcpy nodes have some additional restrictions with regards to managed memory, if the + * system contains at least one device which has a zero value for the device attribute + * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. If one or more of the operands refer + * to managed memory, then using the memory type ::CU_MEMORYTYPE_UNIFIED is disallowed + * for those operand(s). The managed memory will be treated as residing on either the + * host or the device, depending on which memory type is specified. + * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param copyParams - Parameters for the memory copy + * \param ctx - Context on which to run the node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuMemcpy3D, + * ::cuGraphMemcpyNodeGetParams, + * ::cuGraphMemcpyNodeSetParams, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddHostNode, + * ::cuGraphAddMemsetNode + */ +CUresult CUDAAPI cuGraphAddMemcpyNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMCPY3D *copyParams, CUcontext ctx); + +/** + * \brief Returns a memcpy node's parameters + * + * Returns the parameters of memcpy node \p hNode in \p nodeParams. + * + * \param hNode - Node to get the parameters for + * \param nodeParams - Pointer to return the parameters + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuMemcpy3D, + * ::cuGraphAddMemcpyNode, + * ::cuGraphMemcpyNodeSetParams + */ +CUresult CUDAAPI cuGraphMemcpyNodeGetParams(CUgraphNode hNode, CUDA_MEMCPY3D *nodeParams); + +/** + * \brief Sets a memcpy node's parameters + * + * Sets the parameters of memcpy node \p hNode to \p nodeParams. + * + * \param hNode - Node to set the parameters for + * \param nodeParams - Parameters to copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuMemcpy3D, + * ::cuGraphAddMemcpyNode, + * ::cuGraphMemcpyNodeGetParams + */ +CUresult CUDAAPI cuGraphMemcpyNodeSetParams(CUgraphNode hNode, const CUDA_MEMCPY3D *nodeParams); + +/** + * \brief Creates a memset node and adds it to a graph + * + * Creates a new memset node and adds it to \p hGraph with \p numDependencies + * dependencies specified via \p dependencies. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p phGraphNode. + * + * The element size must be 1, 2, or 4 bytes. + * When the graph is launched, the node will perform the memset described by \p memsetParams. + * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param memsetParams - Parameters for the memory set + * \param ctx - Context on which to run the node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_CONTEXT + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuMemsetD2D32, + * ::cuGraphMemsetNodeGetParams, + * ::cuGraphMemsetNodeSetParams, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddHostNode, + * ::cuGraphAddMemcpyNode + */ +CUresult CUDAAPI cuGraphAddMemsetNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx); + +/** + * \brief Returns a memset node's parameters + * + * Returns the parameters of memset node \p hNode in \p nodeParams. + * + * \param hNode - Node to get the parameters for + * \param nodeParams - Pointer to return the parameters + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuMemsetD2D32, + * ::cuGraphAddMemsetNode, + * ::cuGraphMemsetNodeSetParams + */ +CUresult CUDAAPI cuGraphMemsetNodeGetParams(CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS *nodeParams); + +/** + * \brief Sets a memset node's parameters + * + * Sets the parameters of memset node \p hNode to \p nodeParams. + * + * \param hNode - Node to set the parameters for + * \param nodeParams - Parameters to copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuMemsetD2D32, + * ::cuGraphAddMemsetNode, + * ::cuGraphMemsetNodeGetParams + */ +CUresult CUDAAPI cuGraphMemsetNodeSetParams(CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *nodeParams); + +/** + * \brief Creates a host execution node and adds it to a graph + * + * Creates a new CPU execution node and adds it to \p hGraph with \p numDependencies + * dependencies specified via \p dependencies and arguments specified in \p nodeParams. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p phGraphNode. + * + * When the graph is launched, the node will invoke the specified CPU function. + * Host nodes are not supported under MPS with pre-Volta GPUs. + * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param nodeParams - Parameters for the host node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuLaunchHostFunc, + * ::cuGraphHostNodeGetParams, + * ::cuGraphHostNodeSetParams, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode + */ +CUresult CUDAAPI cuGraphAddHostNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_HOST_NODE_PARAMS *nodeParams); + +/** + * \brief Returns a host node's parameters + * + * Returns the parameters of host node \p hNode in \p nodeParams. + * + * \param hNode - Node to get the parameters for + * \param nodeParams - Pointer to return the parameters + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuLaunchHostFunc, + * ::cuGraphAddHostNode, + * ::cuGraphHostNodeSetParams + */ +CUresult CUDAAPI cuGraphHostNodeGetParams(CUgraphNode hNode, CUDA_HOST_NODE_PARAMS *nodeParams); + +/** + * \brief Sets a host node's parameters + * + * Sets the parameters of host node \p hNode to \p nodeParams. + * + * \param hNode - Node to set the parameters for + * \param nodeParams - Parameters to copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuLaunchHostFunc, + * ::cuGraphAddHostNode, + * ::cuGraphHostNodeGetParams + */ +CUresult CUDAAPI cuGraphHostNodeSetParams(CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams); + +/** + * \brief Creates a child graph node and adds it to a graph + * + * Creates a new node which executes an embedded graph, and adds it to \p hGraph with + * \p numDependencies dependencies specified via \p dependencies. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p phGraphNode. + * + * If \p hGraph contains allocation or free nodes, this call will return an error. + * + * The node executes an embedded child graph. The child graph is cloned in this call. + * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param childGraph - The graph to clone into this node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphChildGraphNodeGetGraph, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddHostNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode, + * ::cuGraphClone + */ +CUresult CUDAAPI cuGraphAddChildGraphNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUgraph childGraph); + +/** + * \brief Gets a handle to the embedded graph of a child graph node + * + * Gets a handle to the embedded graph in a child graph node. This call + * does not clone the graph. Changes to the graph will be reflected in + * the node, and the node retains ownership of the graph. + * + * Allocation and free nodes cannot be added to the returned graph. + * Attempting to do so will return an error. + * + * \param hNode - Node to get the embedded graph for + * \param phGraph - Location to store a handle to the graph + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddChildGraphNode, + * ::cuGraphNodeFindInClone + */ +CUresult CUDAAPI cuGraphChildGraphNodeGetGraph(CUgraphNode hNode, CUgraph *phGraph); + +/** + * \brief Creates an empty node and adds it to a graph + * + * Creates a new node which performs no operation, and adds it to \p hGraph with + * \p numDependencies dependencies specified via \p dependencies. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p phGraphNode. + * + * An empty node performs no operation during execution, but can be used for + * transitive ordering. For example, a phased execution graph with 2 groups of n + * nodes with a barrier between them can be represented using an empty node and + * 2*n dependency edges, rather than no empty node and n^2 dependency edges. + * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddHostNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode + */ +CUresult CUDAAPI cuGraphAddEmptyNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies); + +/** + * \brief Creates an event record node and adds it to a graph + * + * Creates a new event record node and adds it to \p hGraph with \p numDependencies + * dependencies specified via \p dependencies and event specified in \p event. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p phGraphNode. + * + * Each launch of the graph will record \p event to capture execution of the + * node's dependencies. + * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param event - Event for the node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddEventWaitNode, + * ::cuEventRecordWithFlags, + * ::cuStreamWaitEvent, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode + */ +CUresult CUDAAPI cuGraphAddEventRecordNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event); + +/** + * \brief Returns the event associated with an event record node + * + * Returns the event of event record node \p hNode in \p event_out. + * + * \param hNode - Node to get the event for + * \param event_out - Pointer to return the event + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddEventRecordNode, + * ::cuGraphEventRecordNodeSetEvent, + * ::cuGraphEventWaitNodeGetEvent, + * ::cuEventRecordWithFlags, + * ::cuStreamWaitEvent + */ +CUresult CUDAAPI cuGraphEventRecordNodeGetEvent(CUgraphNode hNode, CUevent *event_out); + +/** + * \brief Sets an event record node's event + * + * Sets the event of event record node \p hNode to \p event. + * + * \param hNode - Node to set the event for + * \param event - Event to use + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddEventRecordNode, + * ::cuGraphEventRecordNodeGetEvent, + * ::cuGraphEventWaitNodeSetEvent, + * ::cuEventRecordWithFlags, + * ::cuStreamWaitEvent + */ +CUresult CUDAAPI cuGraphEventRecordNodeSetEvent(CUgraphNode hNode, CUevent event); + +/** + * \brief Creates an event wait node and adds it to a graph + * + * Creates a new event wait node and adds it to \p hGraph with \p numDependencies + * dependencies specified via \p dependencies and event specified in \p event. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p phGraphNode. + * + * The graph node will wait for all work captured in \p event. See ::cuEventRecord() + * for details on what is captured by an event. \p event may be from a different context + * or device than the launch stream. + * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param event - Event for the node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddEventRecordNode, + * ::cuEventRecordWithFlags, + * ::cuStreamWaitEvent, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode + */ +CUresult CUDAAPI cuGraphAddEventWaitNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event); + +/** + * \brief Returns the event associated with an event wait node + * + * Returns the event of event wait node \p hNode in \p event_out. + * + * \param hNode - Node to get the event for + * \param event_out - Pointer to return the event + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddEventWaitNode, + * ::cuGraphEventWaitNodeSetEvent, + * ::cuGraphEventRecordNodeGetEvent, + * ::cuEventRecordWithFlags, + * ::cuStreamWaitEvent + */ +CUresult CUDAAPI cuGraphEventWaitNodeGetEvent(CUgraphNode hNode, CUevent *event_out); + +/** + * \brief Sets an event wait node's event + * + * Sets the event of event wait node \p hNode to \p event. + * + * \param hNode - Node to set the event for + * \param event - Event to use + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddEventWaitNode, + * ::cuGraphEventWaitNodeGetEvent, + * ::cuGraphEventRecordNodeSetEvent, + * ::cuEventRecordWithFlags, + * ::cuStreamWaitEvent + */ +CUresult CUDAAPI cuGraphEventWaitNodeSetEvent(CUgraphNode hNode, CUevent event); + +/** + * \brief Creates an external semaphore signal node and adds it to a graph + * + * Creates a new external semaphore signal node and adds it to \p hGraph with \p + * numDependencies dependencies specified via \p dependencies and arguments specified + * in \p nodeParams. It is possible for \p numDependencies to be 0, in which case the + * node will be placed at the root of the graph. \p dependencies may not have any + * duplicate entries. A handle to the new node will be returned in \p phGraphNode. + * + * Performs a signal operation on a set of externally allocated semaphore objects + * when the node is launched. The operation(s) will occur after all of the node's + * dependencies have completed. + * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param nodeParams - Parameters for the node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphExternalSemaphoresSignalNodeGetParams, + * ::cuGraphExternalSemaphoresSignalNodeSetParams, + * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, + * ::cuGraphAddExternalSemaphoresWaitNode, + * ::cuImportExternalSemaphore, + * ::cuSignalExternalSemaphoresAsync, + * ::cuWaitExternalSemaphoresAsync, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddEventRecordNode, + * ::cuGraphAddEventWaitNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode + */ +CUresult CUDAAPI cuGraphAddExternalSemaphoresSignalNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams); + +/** + * \brief Returns an external semaphore signal node's parameters + * + * Returns the parameters of an external semaphore signal node \p hNode in \p params_out. + * The \p extSemArray and \p paramsArray returned in \p params_out, + * are owned by the node. This memory remains valid until the node is destroyed or its + * parameters are modified, and should not be modified + * directly. Use ::cuGraphExternalSemaphoresSignalNodeSetParams to update the + * parameters of this node. + * + * \param hNode - Node to get the parameters for + * \param params_out - Pointer to return the parameters + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuLaunchKernel, + * ::cuGraphAddExternalSemaphoresSignalNode, + * ::cuGraphExternalSemaphoresSignalNodeSetParams, + * ::cuGraphAddExternalSemaphoresWaitNode, + * ::cuSignalExternalSemaphoresAsync, + * ::cuWaitExternalSemaphoresAsync + */ +CUresult CUDAAPI cuGraphExternalSemaphoresSignalNodeGetParams(CUgraphNode hNode, CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *params_out); + +/** + * \brief Sets an external semaphore signal node's parameters + * + * Sets the parameters of an external semaphore signal node \p hNode to \p nodeParams. + * + * \param hNode - Node to set the parameters for + * \param nodeParams - Parameters to copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddExternalSemaphoresSignalNode, + * ::cuGraphExternalSemaphoresSignalNodeSetParams, + * ::cuGraphAddExternalSemaphoresWaitNode, + * ::cuSignalExternalSemaphoresAsync, + * ::cuWaitExternalSemaphoresAsync + */ +CUresult CUDAAPI cuGraphExternalSemaphoresSignalNodeSetParams(CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams); + +/** + * \brief Creates an external semaphore wait node and adds it to a graph + * + * Creates a new external semaphore wait node and adds it to \p hGraph with \p numDependencies + * dependencies specified via \p dependencies and arguments specified in \p nodeParams. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. A handle + * to the new node will be returned in \p phGraphNode. + * + * Performs a wait operation on a set of externally allocated semaphore objects + * when the node is launched. The node's dependencies will not be launched until + * the wait operation has completed. + * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param nodeParams - Parameters for the node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphExternalSemaphoresWaitNodeGetParams, + * ::cuGraphExternalSemaphoresWaitNodeSetParams, + * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, + * ::cuGraphAddExternalSemaphoresSignalNode, + * ::cuImportExternalSemaphore, + * ::cuSignalExternalSemaphoresAsync, + * ::cuWaitExternalSemaphoresAsync, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddEventRecordNode, + * ::cuGraphAddEventWaitNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode + */ +CUresult CUDAAPI cuGraphAddExternalSemaphoresWaitNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams); + +/** + * \brief Returns an external semaphore wait node's parameters + * + * Returns the parameters of an external semaphore wait node \p hNode in \p params_out. + * The \p extSemArray and \p paramsArray returned in \p params_out, + * are owned by the node. This memory remains valid until the node is destroyed or its + * parameters are modified, and should not be modified + * directly. Use ::cuGraphExternalSemaphoresSignalNodeSetParams to update the + * parameters of this node. + * + * \param hNode - Node to get the parameters for + * \param params_out - Pointer to return the parameters + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuLaunchKernel, + * ::cuGraphAddExternalSemaphoresWaitNode, + * ::cuGraphExternalSemaphoresWaitNodeSetParams, + * ::cuGraphAddExternalSemaphoresWaitNode, + * ::cuSignalExternalSemaphoresAsync, + * ::cuWaitExternalSemaphoresAsync + */ +CUresult CUDAAPI cuGraphExternalSemaphoresWaitNodeGetParams(CUgraphNode hNode, CUDA_EXT_SEM_WAIT_NODE_PARAMS *params_out); + +/** + * \brief Sets an external semaphore wait node's parameters + * + * Sets the parameters of an external semaphore wait node \p hNode to \p nodeParams. + * + * \param hNode - Node to set the parameters for + * \param nodeParams - Parameters to copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddExternalSemaphoresWaitNode, + * ::cuGraphExternalSemaphoresWaitNodeSetParams, + * ::cuGraphAddExternalSemaphoresWaitNode, + * ::cuSignalExternalSemaphoresAsync, + * ::cuWaitExternalSemaphoresAsync + */ +CUresult CUDAAPI cuGraphExternalSemaphoresWaitNodeSetParams(CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams); + +/** + * \brief Creates a batch memory operation node and adds it to a graph + * + * Creates a new batch memory operation node and adds it to \p hGraph with \p + * numDependencies dependencies specified via \p dependencies and arguments specified in \p nodeParams. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p phGraphNode. + * + * When the node is added, the paramArray inside \p nodeParams is copied and therefore it can be + * freed after the call returns. + * + * \note + * Warning: + * Improper use of this API may deadlock the application. Synchronization + * ordering established through this API is not visible to CUDA. CUDA tasks + * that are (even indirectly) ordered by this API should also have that order + * expressed with CUDA-visible dependencies such as events. This ensures that + * the scheduler does not serialize them in an improper order. For more + * information, see the Stream Memory Operations section in the programming + * guide(https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html). + * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param nodeParams - Parameters for the node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuStreamBatchMemOp, + * ::cuStreamWaitValue32, + * ::cuStreamWriteValue32, + * ::cuStreamWaitValue64, + * ::cuStreamWriteValue64, + * ::cuGraphBatchMemOpNodeGetParams, + * ::cuGraphBatchMemOpNodeSetParams, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode + */ +CUresult CUDAAPI cuGraphAddBatchMemOpNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams); + +/** + * \brief Returns a batch mem op node's parameters + * + * Returns the parameters of batch mem op node \p hNode in \p nodeParams_out. + * The \p paramArray returned in \p nodeParams_out is owned by the node. + * This memory remains valid until the node is destroyed or its + * parameters are modified, and should not be modified + * directly. Use ::cuGraphBatchMemOpNodeSetParams to update the + * parameters of this node. + * + * \param hNode - Node to get the parameters for + * \param nodeParams_out - Pointer to return the parameters + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuStreamBatchMemOp, + * ::cuGraphAddBatchMemOpNode, + * ::cuGraphBatchMemOpNodeSetParams + */ +CUresult CUDAAPI cuGraphBatchMemOpNodeGetParams(CUgraphNode hNode, CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams_out); + +/** + * \brief Sets a batch mem op node's parameters + * + * Sets the parameters of batch mem op node \p hNode to \p nodeParams. + * + * The paramArray inside \p nodeParams is copied and therefore it can be + * freed after the call returns. + * + * \param hNode - Node to set the parameters for + * \param nodeParams - Parameters to copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuStreamBatchMemOp, + * ::cuGraphAddBatchMemOpNode, + * ::cuGraphBatchMemOpNodeGetParams + */ +CUresult CUDAAPI cuGraphBatchMemOpNodeSetParams(CUgraphNode hNode, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams); + +/** + * \brief Sets the parameters for a batch mem op node in the given graphExec + * + * Sets the parameters of a batch mem op node in an executable graph \p hGraphExec. + * The node is identified by the corresponding node \p hNode in the + * non-executable graph, from which the executable graph was instantiated. + * + * The following fields on operations may be modified on an executable graph: + * + * op.waitValue.address + * op.waitValue.value[64] + * op.waitValue.flags bits corresponding to wait type (i.e. CU_STREAM_WAIT_VALUE_FLUSH bit cannot be modified) + * op.writeValue.address + * op.writeValue.value[64] + * + * Other fields, such as the context, count or type of operations, and other types of operations such as membars, + * may not be modified. + * + * \p hNode must not have been removed from the original graph. + * + * The modifications only affect future launches of \p hGraphExec. Already + * enqueued or running launches of \p hGraphExec are not affected by this call. + * \p hNode is also not modified by this call. + * + * The paramArray inside \p nodeParams is copied and therefore it can be + * freed after the call returns. + * + * \param hGraphExec - The executable graph in which to set the specified node + * \param hNode - Batch mem op node from the graph from which graphExec was instantiated + * \param nodeParams - Updated Parameters to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuStreamBatchMemOp, + * ::cuGraphAddBatchMemOpNode, + * ::cuGraphBatchMemOpNodeGetParams, + * ::cuGraphBatchMemOpNodeSetParams, + * ::cuGraphInstantiate + */ +CUresult CUDAAPI cuGraphExecBatchMemOpNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams); + +/** + * \brief Creates an allocation node and adds it to a graph + * + * Creates a new allocation node and adds it to \p hGraph with \p numDependencies + * dependencies specified via \p dependencies and arguments specified in \p nodeParams. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. A handle + * to the new node will be returned in \p phGraphNode. + * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param nodeParams - Parameters for the node + * + * When ::cuGraphAddMemAllocNode creates an allocation node, it returns the address of the allocation in + * \p nodeParams.dptr. The allocation's address remains fixed across instantiations and launches. + * + * If the allocation is freed in the same graph, by creating a free node using ::cuGraphAddMemFreeNode, + * the allocation can be accessed by nodes ordered after the allocation node but before the free node. + * These allocations cannot be freed outside the owning graph, and they can only be freed once in the + * owning graph. + * + * If the allocation is not freed in the same graph, then it can be accessed not only by nodes in the + * graph which are ordered after the allocation node, but also by stream operations ordered after the + * graph's execution but before the allocation is freed. + * + * Allocations which are not freed in the same graph can be freed by: + * - passing the allocation to ::cuMemFreeAsync or ::cuMemFree; + * - launching a graph with a free node for that allocation; or + * - specifying ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH during instantiation, which makes + * each launch behave as though it called ::cuMemFreeAsync for every unfreed allocation. + * + * It is not possible to free an allocation in both the owning graph and another graph. If the allocation + * is freed in the same graph, a free node cannot be added to another graph. If the allocation is freed + * in another graph, a free node can no longer be added to the owning graph. + * + * The following restrictions apply to graphs which contain allocation and/or memory free nodes: + * - Nodes and edges of the graph cannot be deleted. + * - The graph cannot be used in a child node. + * - Only one instantiation of the graph may exist at any point in time. + * - The graph cannot be cloned. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddMemFreeNode, + * ::cuGraphMemAllocNodeGetParams, + * ::cuDeviceGraphMemTrim, + * ::cuDeviceGetGraphMemAttribute, + * ::cuDeviceSetGraphMemAttribute, + * ::cuMemAllocAsync, + * ::cuMemFreeAsync, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddEventRecordNode, + * ::cuGraphAddEventWaitNode, + * ::cuGraphAddExternalSemaphoresSignalNode, + * ::cuGraphAddExternalSemaphoresWaitNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode + */ +CUresult CUDAAPI cuGraphAddMemAllocNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUDA_MEM_ALLOC_NODE_PARAMS *nodeParams); + +/** + * \brief Returns a memory alloc node's parameters + * + * Returns the parameters of a memory alloc node \p hNode in \p params_out. + * The \p poolProps and \p accessDescs returned in \p params_out, are owned by the + * node. This memory remains valid until the node is destroyed. The returned + * parameters must not be modified. + * + * \param hNode - Node to get the parameters for + * \param params_out - Pointer to return the parameters + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddMemAllocNode, + * ::cuGraphMemFreeNodeGetParams + */ +CUresult CUDAAPI cuGraphMemAllocNodeGetParams(CUgraphNode hNode, CUDA_MEM_ALLOC_NODE_PARAMS *params_out); + +/** + * \brief Creates a memory free node and adds it to a graph + * + * Creates a new memory free node and adds it to \p hGraph with \p numDependencies + * dependencies specified via \p dependencies and arguments specified in \p nodeParams. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. A handle + * to the new node will be returned in \p phGraphNode. + * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param dptr - Address of memory to free + * + * ::cuGraphAddMemFreeNode will return ::CUDA_ERROR_INVALID_VALUE if the user attempts to free: + * - an allocation twice in the same graph. + * - an address that was not returned by an allocation node. + * - an invalid address. + * + * The following restrictions apply to graphs which contain allocation and/or memory free nodes: + * - Nodes and edges of the graph cannot be deleted. + * - The graph cannot be used in a child node. + * - Only one instantiation of the graph may exist at any point in time. + * - The graph cannot be cloned. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddMemAllocNode, + * ::cuGraphMemFreeNodeGetParams, + * ::cuDeviceGraphMemTrim, + * ::cuDeviceGetGraphMemAttribute, + * ::cuDeviceSetGraphMemAttribute, + * ::cuMemAllocAsync, + * ::cuMemFreeAsync, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddEventRecordNode, + * ::cuGraphAddEventWaitNode, + * ::cuGraphAddExternalSemaphoresSignalNode, + * ::cuGraphAddExternalSemaphoresWaitNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode + */ +CUresult CUDAAPI cuGraphAddMemFreeNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUdeviceptr dptr); + +/** + * \brief Returns a memory free node's parameters + * + * Returns the address of a memory free node \p hNode in \p dptr_out. + * + * \param hNode - Node to get the parameters for + * \param dptr_out - Pointer to return the device address + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddMemFreeNode, + * ::cuGraphMemAllocNodeGetParams + */ +CUresult CUDAAPI cuGraphMemFreeNodeGetParams(CUgraphNode hNode, CUdeviceptr *dptr_out); + +/** + * \brief Free unused memory that was cached on the specified device for use with graphs back to the OS. + * + * Blocks which are not in use by a graph that is either currently executing or scheduled to execute are + * freed back to the operating system. + * + * \param device - The device for which cached memory should be freed. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_DEVICE + * + * \sa + * ::cuGraphAddMemAllocNode, + * ::cuGraphAddMemFreeNode, + * ::cuDeviceSetGraphMemAttribute, + * ::cuDeviceGetGraphMemAttribute + */ +CUresult CUDAAPI cuDeviceGraphMemTrim(CUdevice device); + +/** + * \brief Query asynchronous allocation attributes related to graphs + * + * Valid attributes are: + * + * - ::CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT: Amount of memory, in bytes, currently associated with graphs + * - ::CU_GRAPH_MEM_ATTR_USED_MEM_HIGH: High watermark of memory, in bytes, associated with graphs since the + * last time it was reset. High watermark can only be reset to zero. + * - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT: Amount of memory, in bytes, currently allocated for use by + * the CUDA graphs asynchronous allocator. + * - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH: High watermark of memory, in bytes, currently allocated for use by + * the CUDA graphs asynchronous allocator. + * + * \param device - Specifies the scope of the query + * \param attr - attribute to get + * \param value - retrieved value + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_DEVICE + * + * \sa + * ::cuDeviceSetGraphMemAttribute, + * ::cuGraphAddMemAllocNode, + * ::cuGraphAddMemFreeNode + */ +CUresult CUDAAPI cuDeviceGetGraphMemAttribute(CUdevice device, CUgraphMem_attribute attr, void* value); + +/** + * \brief Set asynchronous allocation attributes related to graphs + * + * Valid attributes are: + * + * - ::CU_GRAPH_MEM_ATTR_USED_MEM_HIGH: High watermark of memory, in bytes, associated with graphs since the + * last time it was reset. High watermark can only be reset to zero. + * - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH: High watermark of memory, in bytes, currently allocated for use by + * the CUDA graphs asynchronous allocator. + * + * \param device - Specifies the scope of the query + * \param attr - attribute to get + * \param value - pointer to value to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_DEVICE + * + * \sa + * ::cuDeviceGetGraphMemAttribute, + * ::cuGraphAddMemAllocNode, + * ::cuGraphAddMemFreeNode + */ +CUresult CUDAAPI cuDeviceSetGraphMemAttribute(CUdevice device, CUgraphMem_attribute attr, void* value); + +/** + * \brief Clones a graph + * + * This function creates a copy of \p originalGraph and returns it in \p phGraphClone. + * All parameters are copied into the cloned graph. The original graph may be modified + * after this call without affecting the clone. + * + * Child graph nodes in the original graph are recursively copied into the clone. + * + * \param phGraphClone - Returns newly created cloned graph + * \param originalGraph - Graph to clone + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphCreate, + * ::cuGraphNodeFindInClone + */ +CUresult CUDAAPI cuGraphClone(CUgraph *phGraphClone, CUgraph originalGraph); + +/** + * \brief Finds a cloned version of a node + * + * This function returns the node in \p hClonedGraph corresponding to \p hOriginalNode + * in the original graph. + * + * \p hClonedGraph must have been cloned from \p hOriginalGraph via ::cuGraphClone. + * \p hOriginalNode must have been in \p hOriginalGraph at the time of the call to + * ::cuGraphClone, and the corresponding cloned node in \p hClonedGraph must not have + * been removed. The cloned node is then returned via \p phClonedNode. + * + * \param phNode - Returns handle to the cloned node + * \param hOriginalNode - Handle to the original node + * \param hClonedGraph - Cloned graph to query + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphClone + */ +CUresult CUDAAPI cuGraphNodeFindInClone(CUgraphNode *phNode, CUgraphNode hOriginalNode, CUgraph hClonedGraph); + +/** + * \brief Returns a node's type + * + * Returns the node type of \p hNode in \p type. + * + * \param hNode - Node to query + * \param type - Pointer to return the node type + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphGetNodes, + * ::cuGraphGetRootNodes, + * ::cuGraphChildGraphNodeGetGraph, + * ::cuGraphKernelNodeGetParams, + * ::cuGraphKernelNodeSetParams, + * ::cuGraphHostNodeGetParams, + * ::cuGraphHostNodeSetParams, + * ::cuGraphMemcpyNodeGetParams, + * ::cuGraphMemcpyNodeSetParams, + * ::cuGraphMemsetNodeGetParams, + * ::cuGraphMemsetNodeSetParams + */ +CUresult CUDAAPI cuGraphNodeGetType(CUgraphNode hNode, CUgraphNodeType *type); + +/** + * \brief Returns a graph's nodes + * + * Returns a list of \p hGraph's nodes. \p nodes may be NULL, in which case this + * function will return the number of nodes in \p numNodes. Otherwise, + * \p numNodes entries will be filled in. If \p numNodes is higher than the actual + * number of nodes, the remaining entries in \p nodes will be set to NULL, and the + * number of nodes actually obtained will be returned in \p numNodes. + * + * \param hGraph - Graph to query + * \param nodes - Pointer to return the nodes + * \param numNodes - See description + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphCreate, + * ::cuGraphGetRootNodes, + * ::cuGraphGetEdges, + * ::cuGraphNodeGetType, + * ::cuGraphNodeGetDependencies, + * ::cuGraphNodeGetDependentNodes + */ +CUresult CUDAAPI cuGraphGetNodes(CUgraph hGraph, CUgraphNode *nodes, size_t *numNodes); + +/** + * \brief Returns a graph's root nodes + * + * Returns a list of \p hGraph's root nodes. \p rootNodes may be NULL, in which case this + * function will return the number of root nodes in \p numRootNodes. Otherwise, + * \p numRootNodes entries will be filled in. If \p numRootNodes is higher than the actual + * number of root nodes, the remaining entries in \p rootNodes will be set to NULL, and the + * number of nodes actually obtained will be returned in \p numRootNodes. + * + * \param hGraph - Graph to query + * \param rootNodes - Pointer to return the root nodes + * \param numRootNodes - See description + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphCreate, + * ::cuGraphGetNodes, + * ::cuGraphGetEdges, + * ::cuGraphNodeGetType, + * ::cuGraphNodeGetDependencies, + * ::cuGraphNodeGetDependentNodes + */ +CUresult CUDAAPI cuGraphGetRootNodes(CUgraph hGraph, CUgraphNode *rootNodes, size_t *numRootNodes); + +/** + * \brief Returns a graph's dependency edges + * + * Returns a list of \p hGraph's dependency edges. Edges are returned via corresponding + * indices in \p from and \p to; that is, the node in \p to[i] has a dependency on the + * node in \p from[i]. \p from and \p to may both be NULL, in which + * case this function only returns the number of edges in \p numEdges. Otherwise, + * \p numEdges entries will be filled in. If \p numEdges is higher than the actual + * number of edges, the remaining entries in \p from and \p to will be set to NULL, and + * the number of edges actually returned will be written to \p numEdges. + * + * \param hGraph - Graph to get the edges from + * \param from - Location to return edge endpoints + * \param to - Location to return edge endpoints + * \param numEdges - See description + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphGetNodes, + * ::cuGraphGetRootNodes, + * ::cuGraphAddDependencies, + * ::cuGraphRemoveDependencies, + * ::cuGraphNodeGetDependencies, + * ::cuGraphNodeGetDependentNodes + */ +CUresult CUDAAPI cuGraphGetEdges(CUgraph hGraph, CUgraphNode *from, CUgraphNode *to, size_t *numEdges); + +/** + * \brief Returns a node's dependencies + * + * Returns a list of \p node's dependencies. \p dependencies may be NULL, in which case this + * function will return the number of dependencies in \p numDependencies. Otherwise, + * \p numDependencies entries will be filled in. If \p numDependencies is higher than the actual + * number of dependencies, the remaining entries in \p dependencies will be set to NULL, and the + * number of nodes actually obtained will be returned in \p numDependencies. + * + * \param hNode - Node to query + * \param dependencies - Pointer to return the dependencies + * \param numDependencies - See description + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphNodeGetDependentNodes, + * ::cuGraphGetNodes, + * ::cuGraphGetRootNodes, + * ::cuGraphGetEdges, + * ::cuGraphAddDependencies, + * ::cuGraphRemoveDependencies + */ +CUresult CUDAAPI cuGraphNodeGetDependencies(CUgraphNode hNode, CUgraphNode *dependencies, size_t *numDependencies); + +/** + * \brief Returns a node's dependent nodes + * + * Returns a list of \p node's dependent nodes. \p dependentNodes may be NULL, in which + * case this function will return the number of dependent nodes in \p numDependentNodes. + * Otherwise, \p numDependentNodes entries will be filled in. If \p numDependentNodes is + * higher than the actual number of dependent nodes, the remaining entries in + * \p dependentNodes will be set to NULL, and the number of nodes actually obtained will + * be returned in \p numDependentNodes. + * + * \param hNode - Node to query + * \param dependentNodes - Pointer to return the dependent nodes + * \param numDependentNodes - See description + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphNodeGetDependencies, + * ::cuGraphGetNodes, + * ::cuGraphGetRootNodes, + * ::cuGraphGetEdges, + * ::cuGraphAddDependencies, + * ::cuGraphRemoveDependencies + */ +CUresult CUDAAPI cuGraphNodeGetDependentNodes(CUgraphNode hNode, CUgraphNode *dependentNodes, size_t *numDependentNodes); + +/** + * \brief Adds dependency edges to a graph + * + * The number of dependencies to be added is defined by \p numDependencies + * Elements in \p from and \p to at corresponding indices define a dependency. + * Each node in \p from and \p to must belong to \p hGraph. + * + * If \p numDependencies is 0, elements in \p from and \p to will be ignored. + * Specifying an existing dependency will return an error. + * + * \param hGraph - Graph to which dependencies are added + * \param from - Array of nodes that provide the dependencies + * \param to - Array of dependent nodes + * \param numDependencies - Number of dependencies to be added + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphRemoveDependencies, + * ::cuGraphGetEdges, + * ::cuGraphNodeGetDependencies, + * ::cuGraphNodeGetDependentNodes + */ +CUresult CUDAAPI cuGraphAddDependencies(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies); + +/** + * \brief Removes dependency edges from a graph + * + * The number of \p dependencies to be removed is defined by \p numDependencies. + * Elements in \p from and \p to at corresponding indices define a dependency. + * Each node in \p from and \p to must belong to \p hGraph. + * + * If \p numDependencies is 0, elements in \p from and \p to will be ignored. + * Specifying a non-existing dependency will return an error. + * + * Dependencies cannot be removed from graphs which contain allocation or free nodes. + * Any attempt to do so will return an error. + * + * \param hGraph - Graph from which to remove dependencies + * \param from - Array of nodes that provide the dependencies + * \param to - Array of dependent nodes + * \param numDependencies - Number of dependencies to be removed + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddDependencies, + * ::cuGraphGetEdges, + * ::cuGraphNodeGetDependencies, + * ::cuGraphNodeGetDependentNodes + */ +CUresult CUDAAPI cuGraphRemoveDependencies(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies); + +/** + * \brief Remove a node from the graph + * + * Removes \p hNode from its graph. This operation also severs any dependencies of other nodes + * on \p hNode and vice versa. + * + * Nodes which belong to a graph which contains allocation or free nodes cannot be destroyed. + * Any attempt to do so will return an error. + * + * \param hNode - Node to remove + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddHostNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode + */ +CUresult CUDAAPI cuGraphDestroyNode(CUgraphNode hNode); + +/** + * \brief Creates an executable graph from a graph + * + * Instantiates \p hGraph as an executable graph. The graph is validated for any + * structural constraints or intra-node constraints which were not previously + * validated. If instantiation is successful, a handle to the instantiated graph + * is returned in \p phGraphExec. + * + * The \p flags parameter controls the behavior of instantiation and subsequent + * graph launches. Valid flags are: + * + * - ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH, which configures a + * graph containing memory allocation nodes to automatically free any + * unfreed memory allocations before the graph is relaunched. + * + * - ::CUDA_GRAPH_INSTANTIATE_FLAG_DEVICE_LAUNCH, which configures the graph for launch + * from the device. If this flag is passed, the executable graph handle returned can be + * used to launch the graph from both the host and device. This flag can only be used + * on platforms which support unified addressing. This flag cannot be used in + * conjunction with ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH. + * + * - ::CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY, which causes the graph + * to use the priorities from the per-node attributes rather than the priority + * of the launch stream during execution. Note that priorities are only available + * on kernel nodes, and are copied from stream priority during stream capture. + * + * If \p hGraph contains any allocation or free nodes, there can be at most one + * executable graph in existence for that graph at a time. An attempt to instantiate + * a second executable graph before destroying the first with ::cuGraphExecDestroy + * will result in an error. + * + * If \p hGraph contains kernels which call device-side cudaGraphLaunch() from multiple + * contexts, this will result in an error. + * + * Graphs instantiated for launch on the device have additional restrictions which do not + * apply to host graphs: + * + * - The graph's nodes must reside on a single context. + * - The graph can only contain kernel nodes, memcpy nodes, memset nodes, and child graph nodes. + * Operation-specific restrictions are outlined below. + * - Kernel nodes: + * - Use of CUDA Dynamic Parallelism is not permitted. + * - Cooperative launches are permitted as long as MPS is not in use. + * - Memcpy nodes: + * - Only copies involving device memory and/or pinned device-mapped host memory are permitted. + * - Copies involving CUDA arrays are not permitted. + * - Both operands must be accessible from the current context, and the current context must + * match the context of other nodes in the graph. + * + * \param phGraphExec - Returns instantiated graph + * \param hGraph - Graph to instantiate + * \param flags - Flags to control instantiation. See ::CUgraphInstantiate_flags. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphInstantiate, + * ::cuGraphCreate, + * ::cuGraphUpload, + * ::cuGraphLaunch, + * ::cuGraphExecDestroy + */ +CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph, unsigned long long flags); + +/** + * \brief Creates an executable graph from a graph + * + * Instantiates \p hGraph as an executable graph according to the \p instantiateParams structure. + * The graph is validated for any structural constraints or intra-node constraints + * which were not previously validated. If instantiation is successful, a handle to + * the instantiated graph is returned in \p phGraphExec. + * + * \p instantiateParams controls the behavior of instantiation and subsequent + * graph launches, as well as returning more detailed information in the event of an error. + * ::CUDA_GRAPH_INSTANTIATE_PARAMS is defined as: + * + * \code + typedef struct { + cuuint64_t flags; + CUstream hUploadStream; + CUgraphNode hErrNode_out; + CUgraphInstantiateResult result_out; + } CUDA_GRAPH_INSTANTIATE_PARAMS; + * \endcode + * + * The \p flags field controls the behavior of instantiation and subsequent + * graph launches. Valid flags are: + * + * - ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH, which configures a + * graph containing memory allocation nodes to automatically free any + * unfreed memory allocations before the graph is relaunched. + * + * - ::CUDA_GRAPH_INSTANTIATE_FLAG_UPLOAD, which will perform an upload of the graph + * into \p hUploadStream once the graph has been instantiated. + * + * - ::CUDA_GRAPH_INSTANTIATE_FLAG_DEVICE_LAUNCH, which configures the graph for launch + * from the device. If this flag is passed, the executable graph handle returned can be + * used to launch the graph from both the host and device. This flag can only be used + * on platforms which support unified addressing. This flag cannot be used in + * conjunction with ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH. + * + * - ::CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY, which causes the graph + * to use the priorities from the per-node attributes rather than the priority + * of the launch stream during execution. Note that priorities are only available + * on kernel nodes, and are copied from stream priority during stream capture. + * + * If \p hGraph contains any allocation or free nodes, there can be at most one + * executable graph in existence for that graph at a time. An attempt to instantiate a + * second executable graph before destroying the first with ::cuGraphExecDestroy will + * result in an error. + * + * If \p hGraph contains kernels which call device-side cudaGraphLaunch() from multiple + * contexts, this will result in an error. + * + * Graphs instantiated for launch on the device have additional restrictions which do not + * apply to host graphs: + * + * - The graph's nodes must reside on a single context. + * - The graph can only contain kernel nodes, memcpy nodes, memset nodes, and child graph nodes. + * Operation-specific restrictions are outlined below. + * - Kernel nodes: + * - Use of CUDA Dynamic Parallelism is not permitted. + * - Cooperative launches are permitted as long as MPS is not in use. + * - Memcpy nodes: + * - Only copies involving device memory and/or pinned device-mapped host memory are permitted. + * - Copies involving CUDA arrays are not permitted. + * - Both operands must be accessible from the current context, and the current context must + * match the context of other nodes in the graph. + * + * In the event of an error, the \p result_out and \p hErrNode_out fields will contain more + * information about the nature of the error. Possible error reporting includes: + * + * - ::CUDA_GRAPH_INSTANTIATE_ERROR, if passed an invalid value or if an unexpected error occurred + * which is described by the return value of the function. \p hErrNode_out will be set to NULL. + * - ::CUDA_GRAPH_INSTANTIATE_INVALID_STRUCTURE, if the graph structure is invalid. \p hErrNode_out + * will be set to one of the offending nodes. + * - ::CUDA_GRAPH_INSTANTIATE_NODE_OPERATION_NOT_SUPPORTED, if the graph is instantiated for device + * launch but contains a node of an unsupported node type, or a node which performs unsupported + * operations, such as use of CUDA dynamic parallelism within a kernel node. \p hErrNode_out will + * be set to this node. + * - ::CUDA_GRAPH_INSTANTIATE_MULTIPLE_CTXS_NOT_SUPPORTED, if the graph is instantiated for device + * launch but a node’s context differs from that of another node. This error can also be returned + * if a graph is not instantiated for device launch and it contains kernels which call device-side + * cudaGraphLaunch() from multiple contexts. \p hErrNode_out will be set to this node. + * + * If instantiation is successful, \p result_out will be set to ::CUDA_GRAPH_INSTANTIATE_SUCCESS, + * and \p hErrNode_out will be set to NULL. + * + * \param phGraphExec - Returns instantiated graph + * \param hGraph - Graph to instantiate + * \param instantiateParams - Instantiation parameters + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphCreate, + * ::cuGraphInstantiate, + * ::cuGraphExecDestroy + */ +CUresult CUDAAPI cuGraphInstantiateWithParams(CUgraphExec *phGraphExec, CUgraph hGraph, CUDA_GRAPH_INSTANTIATE_PARAMS *instantiateParams); + +/** + * \brief Query the instantiation flags of an executable graph + * + * Returns the flags that were passed to instantiation for the given executable graph. + * ::CUDA_GRAPH_INSTANTIATE_FLAG_UPLOAD will not be returned by this API as it does + * not affect the resulting executable graph. + * + * \param hGraphExec - The executable graph to query + * \param flags - Returns the instantiation flags + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphInstantiate, + * ::cuGraphInstantiateWithParams + */ +CUresult CUDAAPI cuGraphExecGetFlags(CUgraphExec hGraphExec, cuuint64_t *flags); + +/** + * \brief Sets the parameters for a kernel node in the given graphExec + * + * Sets the parameters of a kernel node in an executable graph \p hGraphExec. + * The node is identified by the corresponding node \p hNode in the + * non-executable graph, from which the executable graph was instantiated. + * + * \p hNode must not have been removed from the original graph. All \p nodeParams + * fields may change, but the following restrictions apply to \p func updates: + * + * - The owning context of the function cannot change. + * - A node whose function originally did not use CUDA dynamic parallelism cannot be updated + * to a function which uses CDP + * - If \p hGraphExec was not instantiated for device launch, a node whose function originally + * did not use device-side cudaGraphLaunch() cannot be updated to a function which uses + * device-side cudaGraphLaunch() unless the node resides on the same context as nodes which + * contained such calls at instantiate-time. If no such calls were present at instantiation, + * these updates cannot be performed at all. + * + * The modifications only affect future launches of \p hGraphExec. Already + * enqueued or running launches of \p hGraphExec are not affected by this call. + * \p hNode is also not modified by this call. + * + * \param hGraphExec - The executable graph in which to set the specified node + * \param hNode - kernel node from the graph from which graphExec was instantiated + * \param nodeParams - Updated Parameters to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddKernelNode, + * ::cuGraphKernelNodeSetParams, + * ::cuGraphExecMemcpyNodeSetParams, + * ::cuGraphExecMemsetNodeSetParams, + * ::cuGraphExecHostNodeSetParams, + * ::cuGraphExecChildGraphNodeSetParams, + * ::cuGraphExecEventRecordNodeSetEvent, + * ::cuGraphExecEventWaitNodeSetEvent, + * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, + * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, + * ::cuGraphExecUpdate, + * ::cuGraphInstantiate + */ +CUresult CUDAAPI cuGraphExecKernelNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams); + +/** + * \brief Sets the parameters for a memcpy node in the given graphExec. + * + * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had + * contained \p copyParams at instantiation. hNode must remain in the graph which was + * used to instantiate \p hGraphExec. Changed edges to and from hNode are ignored. + * + * The source and destination memory in \p copyParams must be allocated from the same + * contexts as the original source and destination memory. Both the instantiation-time + * memory operands and the memory operands in \p copyParams must be 1-dimensional. + * Zero-length operations are not supported. + * + * The modifications only affect future launches of \p hGraphExec. Already enqueued + * or running launches of \p hGraphExec are not affected by this call. hNode is also + * not modified by this call. + * + * Returns CUDA_ERROR_INVALID_VALUE if the memory operands' mappings changed or + * either the original or new memory operands are multidimensional. + * + * \param hGraphExec - The executable graph in which to set the specified node + * \param hNode - Memcpy node from the graph which was used to instantiate graphExec + * \param copyParams - The updated parameters to set + * \param ctx - Context on which to run the node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddMemcpyNode, + * ::cuGraphMemcpyNodeSetParams, + * ::cuGraphExecKernelNodeSetParams, + * ::cuGraphExecMemsetNodeSetParams, + * ::cuGraphExecHostNodeSetParams, + * ::cuGraphExecChildGraphNodeSetParams, + * ::cuGraphExecEventRecordNodeSetEvent, + * ::cuGraphExecEventWaitNodeSetEvent, + * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, + * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, + * ::cuGraphExecUpdate, + * ::cuGraphInstantiate + */ +CUresult CUDAAPI cuGraphExecMemcpyNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMCPY3D *copyParams, CUcontext ctx); + +/** + * \brief Sets the parameters for a memset node in the given graphExec. + * + * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had + * contained \p memsetParams at instantiation. hNode must remain in the graph which was + * used to instantiate \p hGraphExec. Changed edges to and from hNode are ignored. + * + * The destination memory in \p memsetParams must be allocated from the same + * contexts as the original destination memory. Both the instantiation-time + * memory operand and the memory operand in \p memsetParams must be 1-dimensional. + * Zero-length operations are not supported. + * + * The modifications only affect future launches of \p hGraphExec. Already enqueued + * or running launches of \p hGraphExec are not affected by this call. hNode is also + * not modified by this call. + * + * Returns CUDA_ERROR_INVALID_VALUE if the memory operand's mappings changed or + * either the original or new memory operand are multidimensional. + * + * \param hGraphExec - The executable graph in which to set the specified node + * \param hNode - Memset node from the graph which was used to instantiate graphExec + * \param memsetParams - The updated parameters to set + * \param ctx - Context on which to run the node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddMemsetNode, + * ::cuGraphMemsetNodeSetParams, + * ::cuGraphExecKernelNodeSetParams, + * ::cuGraphExecMemcpyNodeSetParams, + * ::cuGraphExecHostNodeSetParams, + * ::cuGraphExecChildGraphNodeSetParams, + * ::cuGraphExecEventRecordNodeSetEvent, + * ::cuGraphExecEventWaitNodeSetEvent, + * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, + * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, + * ::cuGraphExecUpdate, + * ::cuGraphInstantiate + */ +CUresult CUDAAPI cuGraphExecMemsetNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx); + +/** + * \brief Sets the parameters for a host node in the given graphExec. + * + * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had + * contained \p nodeParams at instantiation. hNode must remain in the graph which was + * used to instantiate \p hGraphExec. Changed edges to and from hNode are ignored. + * + * The modifications only affect future launches of \p hGraphExec. Already enqueued + * or running launches of \p hGraphExec are not affected by this call. hNode is also + * not modified by this call. + * + * \param hGraphExec - The executable graph in which to set the specified node + * \param hNode - Host node from the graph which was used to instantiate graphExec + * \param nodeParams - The updated parameters to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddHostNode, + * ::cuGraphHostNodeSetParams, + * ::cuGraphExecKernelNodeSetParams, + * ::cuGraphExecMemcpyNodeSetParams, + * ::cuGraphExecMemsetNodeSetParams, + * ::cuGraphExecChildGraphNodeSetParams, + * ::cuGraphExecEventRecordNodeSetEvent, + * ::cuGraphExecEventWaitNodeSetEvent, + * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, + * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, + * ::cuGraphExecUpdate, + * ::cuGraphInstantiate + */ +CUresult CUDAAPI cuGraphExecHostNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams); + +/** + * \brief Updates node parameters in the child graph node in the given graphExec. + * + * Updates the work represented by \p hNode in \p hGraphExec as though the nodes contained + * in \p hNode's graph had the parameters contained in \p childGraph's nodes at instantiation. + * \p hNode must remain in the graph which was used to instantiate \p hGraphExec. + * Changed edges to and from \p hNode are ignored. + * + * The modifications only affect future launches of \p hGraphExec. Already enqueued + * or running launches of \p hGraphExec are not affected by this call. \p hNode is also + * not modified by this call. + * + * The topology of \p childGraph, as well as the node insertion order, must match that + * of the graph contained in \p hNode. See ::cuGraphExecUpdate() for a list of restrictions + * on what can be updated in an instantiated graph. The update is recursive, so child graph + * nodes contained within the top level child graph will also be updated. + * + * \param hGraphExec - The executable graph in which to set the specified node + * \param hNode - Host node from the graph which was used to instantiate graphExec + * \param childGraph - The graph supplying the updated parameters + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddChildGraphNode, + * ::cuGraphChildGraphNodeGetGraph, + * ::cuGraphExecKernelNodeSetParams, + * ::cuGraphExecMemcpyNodeSetParams, + * ::cuGraphExecMemsetNodeSetParams, + * ::cuGraphExecHostNodeSetParams, + * ::cuGraphExecEventRecordNodeSetEvent, + * ::cuGraphExecEventWaitNodeSetEvent, + * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, + * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, + * ::cuGraphExecUpdate, + * ::cuGraphInstantiate + */ +CUresult CUDAAPI cuGraphExecChildGraphNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, CUgraph childGraph); + +/** + * \brief Sets the event for an event record node in the given graphExec + * + * Sets the event of an event record node in an executable graph \p hGraphExec. + * The node is identified by the corresponding node \p hNode in the + * non-executable graph, from which the executable graph was instantiated. + * + * The modifications only affect future launches of \p hGraphExec. Already + * enqueued or running launches of \p hGraphExec are not affected by this call. + * \p hNode is also not modified by this call. + * + * \param hGraphExec - The executable graph in which to set the specified node + * \param hNode - event record node from the graph from which graphExec was instantiated + * \param event - Updated event to use + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddEventRecordNode, + * ::cuGraphEventRecordNodeGetEvent, + * ::cuGraphEventWaitNodeSetEvent, + * ::cuEventRecordWithFlags, + * ::cuStreamWaitEvent, + * ::cuGraphExecKernelNodeSetParams, + * ::cuGraphExecMemcpyNodeSetParams, + * ::cuGraphExecMemsetNodeSetParams, + * ::cuGraphExecHostNodeSetParams, + * ::cuGraphExecChildGraphNodeSetParams, + * ::cuGraphExecEventWaitNodeSetEvent, + * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, + * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, + * ::cuGraphExecUpdate, + * ::cuGraphInstantiate + */ +CUresult CUDAAPI cuGraphExecEventRecordNodeSetEvent(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event); + +/** + * \brief Sets the event for an event wait node in the given graphExec + * + * Sets the event of an event wait node in an executable graph \p hGraphExec. + * The node is identified by the corresponding node \p hNode in the + * non-executable graph, from which the executable graph was instantiated. + * + * The modifications only affect future launches of \p hGraphExec. Already + * enqueued or running launches of \p hGraphExec are not affected by this call. + * \p hNode is also not modified by this call. + * + * \param hGraphExec - The executable graph in which to set the specified node + * \param hNode - event wait node from the graph from which graphExec was instantiated + * \param event - Updated event to use + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddEventWaitNode, + * ::cuGraphEventWaitNodeGetEvent, + * ::cuGraphEventRecordNodeSetEvent, + * ::cuEventRecordWithFlags, + * ::cuStreamWaitEvent, + * ::cuGraphExecKernelNodeSetParams, + * ::cuGraphExecMemcpyNodeSetParams, + * ::cuGraphExecMemsetNodeSetParams, + * ::cuGraphExecHostNodeSetParams, + * ::cuGraphExecChildGraphNodeSetParams, + * ::cuGraphExecEventRecordNodeSetEvent, + * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, + * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, + * ::cuGraphExecUpdate, + * ::cuGraphInstantiate + */ +CUresult CUDAAPI cuGraphExecEventWaitNodeSetEvent(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event); + +/** + * \brief Sets the parameters for an external semaphore signal node in the given graphExec + * + * Sets the parameters of an external semaphore signal node in an executable graph \p hGraphExec. + * The node is identified by the corresponding node \p hNode in the + * non-executable graph, from which the executable graph was instantiated. + * + * \p hNode must not have been removed from the original graph. + * + * The modifications only affect future launches of \p hGraphExec. Already + * enqueued or running launches of \p hGraphExec are not affected by this call. + * \p hNode is also not modified by this call. + * + * Changing \p nodeParams->numExtSems is not supported. + * + * \param hGraphExec - The executable graph in which to set the specified node + * \param hNode - semaphore signal node from the graph from which graphExec was instantiated + * \param nodeParams - Updated Parameters to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddExternalSemaphoresSignalNode, + * ::cuImportExternalSemaphore, + * ::cuSignalExternalSemaphoresAsync, + * ::cuWaitExternalSemaphoresAsync, + * ::cuGraphExecKernelNodeSetParams, + * ::cuGraphExecMemcpyNodeSetParams, + * ::cuGraphExecMemsetNodeSetParams, + * ::cuGraphExecHostNodeSetParams, + * ::cuGraphExecChildGraphNodeSetParams, + * ::cuGraphExecEventRecordNodeSetEvent, + * ::cuGraphExecEventWaitNodeSetEvent, + * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, + * ::cuGraphExecUpdate, + * ::cuGraphInstantiate + */ +CUresult CUDAAPI cuGraphExecExternalSemaphoresSignalNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams); + +/** + * \brief Sets the parameters for an external semaphore wait node in the given graphExec + * + * Sets the parameters of an external semaphore wait node in an executable graph \p hGraphExec. + * The node is identified by the corresponding node \p hNode in the + * non-executable graph, from which the executable graph was instantiated. + * + * \p hNode must not have been removed from the original graph. + * + * The modifications only affect future launches of \p hGraphExec. Already + * enqueued or running launches of \p hGraphExec are not affected by this call. + * \p hNode is also not modified by this call. + * + * Changing \p nodeParams->numExtSems is not supported. + * + * \param hGraphExec - The executable graph in which to set the specified node + * \param hNode - semaphore wait node from the graph from which graphExec was instantiated + * \param nodeParams - Updated Parameters to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddExternalSemaphoresWaitNode, + * ::cuImportExternalSemaphore, + * ::cuSignalExternalSemaphoresAsync, + * ::cuWaitExternalSemaphoresAsync, + * ::cuGraphExecKernelNodeSetParams, + * ::cuGraphExecMemcpyNodeSetParams, + * ::cuGraphExecMemsetNodeSetParams, + * ::cuGraphExecHostNodeSetParams, + * ::cuGraphExecChildGraphNodeSetParams, + * ::cuGraphExecEventRecordNodeSetEvent, + * ::cuGraphExecEventWaitNodeSetEvent, + * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, + * ::cuGraphExecUpdate, + * ::cuGraphInstantiate + */ +CUresult CUDAAPI cuGraphExecExternalSemaphoresWaitNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams); + +/** + * \brief Enables or disables the specified node in the given graphExec + * + * Sets \p hNode to be either enabled or disabled. Disabled nodes are functionally equivalent + * to empty nodes until they are reenabled. Existing node parameters are not affected by + * disabling/enabling the node. + * + * The node is identified by the corresponding node \p hNode in the non-executable + * graph, from which the executable graph was instantiated. + * + * \p hNode must not have been removed from the original graph. + * + * The modifications only affect future launches of \p hGraphExec. Already + * enqueued or running launches of \p hGraphExec are not affected by this call. + * \p hNode is also not modified by this call. + * + * \note Currently only kernel, memset and memcpy nodes are supported. + * + * \param hGraphExec - The executable graph in which to set the specified node + * \param hNode - Node from the graph from which graphExec was instantiated + * \param isEnabled - Node is enabled if != 0, otherwise the node is disabled + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphNodeGetEnabled, + * ::cuGraphExecUpdate, + * ::cuGraphInstantiate + * ::cuGraphLaunch + */ +CUresult CUDAAPI cuGraphNodeSetEnabled(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int isEnabled); + +/** + * \brief Query whether a node in the given graphExec is enabled + * + * Sets isEnabled to 1 if \p hNode is enabled, or 0 if \p hNode is disabled. + * + * The node is identified by the corresponding node \p hNode in the non-executable + * graph, from which the executable graph was instantiated. + * + * \p hNode must not have been removed from the original graph. + * + * \note Currently only kernel, memset and memcpy nodes are supported. + * + * \param hGraphExec - The executable graph in which to set the specified node + * \param hNode - Node from the graph from which graphExec was instantiated + * \param isEnabled - Location to return the enabled status of the node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphNodeSetEnabled, + * ::cuGraphExecUpdate, + * ::cuGraphInstantiate + * ::cuGraphLaunch + */ +CUresult CUDAAPI cuGraphNodeGetEnabled(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int *isEnabled); + +/** + * \brief Uploads an executable graph in a stream + * + * Uploads \p hGraphExec to the device in \p hStream without executing it. Uploads of + * the same \p hGraphExec will be serialized. Each upload is ordered behind both any + * previous work in \p hStream and any previous launches of \p hGraphExec. + * Uses memory cached by \p stream to back the allocations owned by \p hGraphExec. + * + * \param hGraphExec - Executable graph to upload + * \param hStream - Stream in which to upload the graph + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphInstantiate, + * ::cuGraphLaunch, + * ::cuGraphExecDestroy + */ +CUresult CUDAAPI cuGraphUpload(CUgraphExec hGraphExec, CUstream hStream); + +/** + * \brief Launches an executable graph in a stream + * + * Executes \p hGraphExec in \p hStream. Only one instance of \p hGraphExec may be executing + * at a time. Each launch is ordered behind both any previous work in \p hStream + * and any previous launches of \p hGraphExec. To execute a graph concurrently, it must be + * instantiated multiple times into multiple executable graphs. + * + * If any allocations created by \p hGraphExec remain unfreed (from a previous launch) and + * \p hGraphExec was not instantiated with ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH, + * the launch will fail with ::CUDA_ERROR_INVALID_VALUE. + * + * \param hGraphExec - Executable graph to launch + * \param hStream - Stream in which to launch the graph + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphInstantiate, + * ::cuGraphUpload, + * ::cuGraphExecDestroy + */ +CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraphExec, CUstream hStream); + +/** + * \brief Destroys an executable graph + * + * Destroys the executable graph specified by \p hGraphExec, as well + * as all of its executable nodes. If the executable graph is + * in-flight, it will not be terminated, but rather freed + * asynchronously on completion. + * + * \param hGraphExec - Executable graph to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphInstantiate, + * ::cuGraphUpload, + * ::cuGraphLaunch + */ +CUresult CUDAAPI cuGraphExecDestroy(CUgraphExec hGraphExec); + +/** + * \brief Destroys a graph + * + * Destroys the graph specified by \p hGraph, as well as all of its nodes. + * + * \param hGraph - Graph to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphCreate + */ +CUresult CUDAAPI cuGraphDestroy(CUgraph hGraph); + +/** + * \brief Check whether an executable graph can be updated with a graph and perform the update if possible + * + * Updates the node parameters in the instantiated graph specified by \p hGraphExec with the + * node parameters in a topologically identical graph specified by \p hGraph. + * + * Limitations: + * + * - Kernel nodes: + * - The owning context of the function cannot change. + * - A node whose function originally did not use CUDA dynamic parallelism cannot be updated + * to a function which uses CDP. + * - A cooperative node cannot be updated to a non-cooperative node, and vice-versa. + * - If the graph was instantiated with CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY, the + * priority attribute cannot change. Equality is checked on the originally requested + * priority values, before they are clamped to the device's supported range. + * - If \p hGraphExec was not instantiated for device launch, a node whose function originally + * did not use device-side cudaGraphLaunch() cannot be updated to a function which uses + * device-side cudaGraphLaunch() unless the node resides on the same context as nodes which + * contained such calls at instantiate-time. If no such calls were present at instantiation, + * these updates cannot be performed at all. + * - Memset and memcpy nodes: + * - The CUDA device(s) to which the operand(s) was allocated/mapped cannot change. + * - The source/destination memory must be allocated from the same contexts as the original + * source/destination memory. + * - Only 1D memsets can be changed. + * - Additional memcpy node restrictions: + * - Changing either the source or destination memory type(i.e. CU_MEMORYTYPE_DEVICE, + * CU_MEMORYTYPE_ARRAY, etc.) is not supported. + * - External semaphore wait nodes and record nodes: + * - Changing the number of semaphores is not supported. + * + * Note: The API may add further restrictions in future releases. The return code should always be checked. + * + * cuGraphExecUpdate sets the result member of \p resultInfo to CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED + * under the following conditions: + * - The count of nodes directly in \p hGraphExec and \p hGraph differ, in which case resultInfo->errorNode + * is set to NULL. + * - \p hGraph has more exit nodes than \p hGraph, in which case resultInfo->errorNode is set to one of + * the exit nodes in hGraph. + * - A node in \p hGraph has a different number of dependencies than the node from \p hGraphExec it is paired with, + * in which case resultInfo->errorNode is set to the node from \p hGraph. + * - A node in \p hGraph has a dependency that does not match with the corresponding dependency of the paired node + * from \p hGraphExec. resultInfo->errorNode will be set to the node from \p hGraph. resultInfo->errorFromNode + * will be set to the mismatched dependency. The dependencies are paired based on edge order and a dependency + * does not match when the nodes are already paired based on other edges examined in the graph. + * + * cuGraphExecUpdate sets the result member of \p resultInfo to: + * - CU_GRAPH_EXEC_UPDATE_ERROR if passed an invalid value. + * - CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED if the graph topology changed + * - CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED if the type of a node changed, in which case + * \p hErrorNode_out is set to the node from \p hGraph. + * - CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE if the function changed in an unsupported + * way(see note above), in which case \p hErrorNode_out is set to the node from \p hGraph + * - CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED if any parameters to a node changed in a way + * that is not supported, in which case \p hErrorNode_out is set to the node from \p hGraph. + * - CU_GRAPH_EXEC_UPDATE_ERROR_ATTRIBUTES_CHANGED if any attributes of a node changed in a way + * that is not supported, in which case \p hErrorNode_out is set to the node from \p hGraph. + * - CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED if something about a node is unsupported, like + * the node's type or configuration, in which case \p hErrorNode_out is set to the node from \p hGraph + * + * If the update fails for a reason not listed above, the result member of \p resultInfo will be set + * to CU_GRAPH_EXEC_UPDATE_ERROR. If the update succeeds, the result member will be set to CU_GRAPH_EXEC_UPDATE_SUCCESS. + * + * cuGraphExecUpdate returns CUDA_SUCCESS when the updated was performed successfully. It returns + * CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE if the graph update was not performed because it included + * changes which violated constraints specific to instantiated graph update. + * + * \param hGraphExec The instantiated graph to be updated + * \param hGraph The graph containing the updated parameters + * \param resultInfo the error info structure + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphInstantiate + */ +CUresult CUDAAPI cuGraphExecUpdate(CUgraphExec hGraphExec, CUgraph hGraph, CUgraphExecUpdateResultInfo *resultInfo); + +/** + * \brief Copies attributes from source node to destination node. + * + * Copies attributes from source node \p src to destination node \p dst. + * Both node must have the same context. + * + * \param[out] dst Destination node + * \param[in] src Source node + * For list of attributes see ::CUkernelNodeAttrID + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa + * ::CUaccessPolicyWindow + */ +CUresult CUDAAPI cuGraphKernelNodeCopyAttributes(CUgraphNode dst, CUgraphNode src); + +/** + * \brief Queries node attribute. + * + * Queries attribute \p attr from node \p hNode and stores it in corresponding + * member of \p value_out. + * + * \param[in] hNode + * \param[in] attr + * \param[out] value_out + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa + * ::CUaccessPolicyWindow + */ +CUresult CUDAAPI cuGraphKernelNodeGetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr, + CUkernelNodeAttrValue *value_out); + +/** + * \brief Sets node attribute. + * + * Sets attribute \p attr on node \p hNode from corresponding attribute of + * \p value. + * + * \param[out] hNode + * \param[in] attr + * \param[out] value + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa + * ::CUaccessPolicyWindow + */ +CUresult CUDAAPI cuGraphKernelNodeSetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr, + const CUkernelNodeAttrValue *value); + +/** + * \brief Write a DOT file describing graph structure + * + * Using the provided \p hGraph, write to \p path a DOT formatted description of the graph. + * By default this includes the graph topology, node types, node id, kernel names and memcpy direction. + * \p flags can be specified to write more detailed information about each node type such as + * parameter values, kernel attributes, node and function handles. + * + * \param hGraph - The graph to create a DOT file from + * \param path - The path to write the DOT file to + * \param flags - Flags from CUgraphDebugDot_flags for specifying which additional node information to write + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OPERATING_SYSTEM + */ +CUresult CUDAAPI cuGraphDebugDotPrint(CUgraph hGraph, const char *path, unsigned int flags); + +/** + * \brief Create a user object + * + * Create a user object with the specified destructor callback and initial reference count. The + * initial references are owned by the caller. + * + * Destructor callbacks cannot make CUDA API calls and should avoid blocking behavior, as they + * are executed by a shared internal thread. Another thread may be signaled to perform such + * actions, if it does not block forward progress of tasks scheduled through CUDA. + * + * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. + * + * \param object_out - Location to return the user object handle + * \param ptr - The pointer to pass to the destroy function + * \param destroy - Callback to free the user object when it is no longer in use + * \param initialRefcount - The initial refcount to create the object with, typically 1. The + * initial references are owned by the calling thread. + * \param flags - Currently it is required to pass ::CU_USER_OBJECT_NO_DESTRUCTOR_SYNC, + * which is the only defined flag. This indicates that the destroy + * callback cannot be waited on by any CUDA API. Users requiring + * synchronization of the callback should signal its completion + * manually. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuUserObjectRetain, + * ::cuUserObjectRelease, + * ::cuGraphRetainUserObject, + * ::cuGraphReleaseUserObject, + * ::cuGraphCreate + */ +CUresult CUDAAPI cuUserObjectCreate(CUuserObject *object_out, void *ptr, CUhostFn destroy, + unsigned int initialRefcount, unsigned int flags); + +/** + * \brief Retain a reference to a user object + * + * Retains new references to a user object. The new references are owned by the caller. + * + * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. + * + * \param object - The object to retain + * \param count - The number of references to retain, typically 1. Must be nonzero + * and not larger than INT_MAX. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuUserObjectCreate, + * ::cuUserObjectRelease, + * ::cuGraphRetainUserObject, + * ::cuGraphReleaseUserObject, + * ::cuGraphCreate + */ +CUresult CUDAAPI cuUserObjectRetain(CUuserObject object, unsigned int count); + +/** + * \brief Release a reference to a user object + * + * Releases user object references owned by the caller. The object's destructor is invoked if + * the reference count reaches zero. + * + * It is undefined behavior to release references not owned by the caller, or to use a user + * object handle after all references are released. + * + * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. + * + * \param object - The object to release + * \param count - The number of references to release, typically 1. Must be nonzero + * and not larger than INT_MAX. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuUserObjectCreate, + * ::cuUserObjectRetain, + * ::cuGraphRetainUserObject, + * ::cuGraphReleaseUserObject, + * ::cuGraphCreate + */ +CUresult CUDAAPI cuUserObjectRelease(CUuserObject object, unsigned int count); + +/** + * \brief Retain a reference to a user object from a graph + * + * Creates or moves user object references that will be owned by a CUDA graph. + * + * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. + * + * \param graph - The graph to associate the reference with + * \param object - The user object to retain a reference for + * \param count - The number of references to add to the graph, typically 1. Must be + * nonzero and not larger than INT_MAX. + * \param flags - The optional flag ::CU_GRAPH_USER_OBJECT_MOVE transfers references + * from the calling thread, rather than create new references. Pass 0 + * to create new references. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuUserObjectCreate, + * ::cuUserObjectRetain, + * ::cuUserObjectRelease, + * ::cuGraphReleaseUserObject, + * ::cuGraphCreate + */ +CUresult CUDAAPI cuGraphRetainUserObject(CUgraph graph, CUuserObject object, unsigned int count, unsigned int flags); + +/** + * \brief Release a user object reference from a graph + * + * Releases user object references owned by a graph. + * + * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. + * + * \param graph - The graph that will release the reference + * \param object - The user object to release a reference for + * \param count - The number of references to release, typically 1. Must be nonzero + * and not larger than INT_MAX. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuUserObjectCreate, + * ::cuUserObjectRetain, + * ::cuUserObjectRelease, + * ::cuGraphRetainUserObject, + * ::cuGraphCreate + */ +CUresult CUDAAPI cuGraphReleaseUserObject(CUgraph graph, CUuserObject object, unsigned int count); + +/** @} */ /* END CUDA_GRAPH */ + +/** + * \defgroup CUDA_OCCUPANCY Occupancy + * + * ___MANBRIEF___ occupancy calculation functions of the low-level CUDA driver + * API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the occupancy calculation functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Returns occupancy of a function + * + * Returns in \p *numBlocks the number of the maximum active blocks per + * streaming multiprocessor. + * + * \param numBlocks - Returned occupancy + * \param func - Kernel for which occupancy is calculated + * \param blockSize - Block size the kernel is intended to be launched with + * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cudaOccupancyMaxActiveBlocksPerMultiprocessor + */ +CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize); + +/** + * \brief Returns occupancy of a function + * + * Returns in \p *numBlocks the number of the maximum active blocks per + * streaming multiprocessor. + * + * The \p Flags parameter controls how special cases are handled. The + * valid flags are: + * + * - ::CU_OCCUPANCY_DEFAULT, which maintains the default behavior as + * ::cuOccupancyMaxActiveBlocksPerMultiprocessor; + * + * - ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE, which suppresses the + * default behavior on platform where global caching affects + * occupancy. On such platforms, if caching is enabled, but + * per-block SM resource usage would result in zero occupancy, the + * occupancy calculator will calculate the occupancy as if caching + * is disabled. Setting ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE makes + * the occupancy calculator to return 0 in such cases. More information + * can be found about this feature in the "Unified L1/Texture Cache" + * section of the Maxwell tuning guide. + * + * \param numBlocks - Returned occupancy + * \param func - Kernel for which occupancy is calculated + * \param blockSize - Block size the kernel is intended to be launched with + * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes + * \param flags - Requested behavior for the occupancy calculator + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + */ +CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize, unsigned int flags); + +/** + * \brief Suggest a launch configuration with reasonable occupancy + * + * Returns in \p *blockSize a reasonable block size that can achieve + * the maximum occupancy (or, the maximum number of active warps with + * the fewest blocks per multiprocessor), and in \p *minGridSize the + * minimum grid size to achieve the maximum occupancy. + * + * If \p blockSizeLimit is 0, the configurator will use the maximum + * block size permitted by the device / function instead. + * + * If per-block dynamic shared memory allocation is not needed, the + * user should leave both \p blockSizeToDynamicSMemSize and \p + * dynamicSMemSize as 0. + * + * If per-block dynamic shared memory allocation is needed, then if + * the dynamic shared memory size is constant regardless of block + * size, the size should be passed through \p dynamicSMemSize, and \p + * blockSizeToDynamicSMemSize should be NULL. + * + * Otherwise, if the per-block dynamic shared memory size varies with + * different block sizes, the user needs to provide a unary function + * through \p blockSizeToDynamicSMemSize that computes the dynamic + * shared memory needed by \p func for any given block size. \p + * dynamicSMemSize is ignored. An example signature is: + * + * \code + * // Take block size, returns dynamic shared memory needed + * size_t blockToSmem(int blockSize); + * \endcode + * + * \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy + * \param blockSize - Returned maximum block size that can achieve the maximum occupancy + * \param func - Kernel for which launch configuration is calculated + * \param blockSizeToDynamicSMemSize - A function that calculates how much per-block dynamic shared memory \p func uses based on the block size + * \param dynamicSMemSize - Dynamic shared memory usage intended, in bytes + * \param blockSizeLimit - The maximum block size \p func is designed to handle + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cudaOccupancyMaxPotentialBlockSize + */ +CUresult CUDAAPI cuOccupancyMaxPotentialBlockSize(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit); + +/** + * \brief Suggest a launch configuration with reasonable occupancy + * + * An extended version of ::cuOccupancyMaxPotentialBlockSize. In + * addition to arguments passed to ::cuOccupancyMaxPotentialBlockSize, + * ::cuOccupancyMaxPotentialBlockSizeWithFlags also takes a \p Flags + * parameter. + * + * The \p Flags parameter controls how special cases are handled. The + * valid flags are: + * + * - ::CU_OCCUPANCY_DEFAULT, which maintains the default behavior as + * ::cuOccupancyMaxPotentialBlockSize; + * + * - ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE, which suppresses the + * default behavior on platform where global caching affects + * occupancy. On such platforms, the launch configurations that + * produces maximal occupancy might not support global + * caching. Setting ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE + * guarantees that the the produced launch configuration is global + * caching compatible at a potential cost of occupancy. More information + * can be found about this feature in the "Unified L1/Texture Cache" + * section of the Maxwell tuning guide. + * + * \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy + * \param blockSize - Returned maximum block size that can achieve the maximum occupancy + * \param func - Kernel for which launch configuration is calculated + * \param blockSizeToDynamicSMemSize - A function that calculates how much per-block dynamic shared memory \p func uses based on the block size + * \param dynamicSMemSize - Dynamic shared memory usage intended, in bytes + * \param blockSizeLimit - The maximum block size \p func is designed to handle + * \param flags - Options + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cudaOccupancyMaxPotentialBlockSizeWithFlags + */ +CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit, unsigned int flags); + +/** + * \brief Returns dynamic shared memory available per block when launching \p numBlocks blocks on SM + * + * Returns in \p *dynamicSmemSize the maximum size of dynamic shared memory to allow \p numBlocks blocks per SM. + * + * \param dynamicSmemSize - Returned maximum dynamic shared memory + * \param func - Kernel function for which occupancy is calculated + * \param numBlocks - Number of blocks to fit on SM + * \param blockSize - Size of the blocks + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + */ +CUresult CUDAAPI cuOccupancyAvailableDynamicSMemPerBlock(size_t *dynamicSmemSize, CUfunction func, int numBlocks, int blockSize); + +/** + * \brief Given the kernel function (\p func) and launch configuration + * (\p config), return the maximum cluster size in \p *clusterSize. + * + * The cluster dimensions in \p config are ignored. If func has a required + * cluster size set (see ::cudaFuncGetAttributes / ::cuFuncGetAttribute),\p + * *clusterSize will reflect the required cluster size. + * + * By default this function will always return a value that's portable on + * future hardware. A higher value may be returned if the kernel function + * allows non-portable cluster sizes. + * + * This function will respect the compile time launch bounds. + * + * \param clusterSize - Returned maximum cluster size that can be launched + * for the given kernel function and launch configuration + * \param func - Kernel function for which maximum cluster + * size is calculated + * \param config - Launch configuration for the given kernel function + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cudaFuncGetAttributes, + * ::cuFuncGetAttribute + */ +CUresult CUDAAPI cuOccupancyMaxPotentialClusterSize(int *clusterSize, CUfunction func, const CUlaunchConfig *config); + +/** + * \brief Given the kernel function (\p func) and launch configuration + * (\p config), return the maximum number of clusters that could co-exist + * on the target device in \p *numClusters. + * + * If the function has required cluster size already set (see + * ::cudaFuncGetAttributes / ::cuFuncGetAttribute), the cluster size + * from config must either be unspecified or match the required size. + * Without required sizes, the cluster size must be specified in config, + * else the function will return an error. + * + * Note that various attributes of the kernel function may affect occupancy + * calculation. Runtime environment may affect how the hardware schedules + * the clusters, so the calculated occupancy is not guaranteed to be achievable. + * + * \param numClusters - Returned maximum number of clusters that + * could co-exist on the target device + * \param func - Kernel function for which maximum number + * of clusters are calculated + * \param config - Launch configuration for the given kernel function + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_CLUSTER_SIZE, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cudaFuncGetAttributes, + * ::cuFuncGetAttribute + */ +CUresult CUDAAPI cuOccupancyMaxActiveClusters(int *numClusters, CUfunction func, const CUlaunchConfig *config); +/** @} */ /* END CUDA_OCCUPANCY */ + +/** + * \defgroup CUDA_TEXREF_DEPRECATED Texture Reference Management [DEPRECATED] + * + * ___MANBRIEF___ deprecated texture reference management functions of the + * low-level CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the deprecated texture reference management + * functions of the low-level CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Binds an array as a texture reference + * + * \deprecated + * + * Binds the CUDA array \p hArray to the texture reference \p hTexRef. Any + * previous address or CUDA array state associated with the texture reference + * is superseded by this function. \p Flags must be set to + * ::CU_TRSA_OVERRIDE_FORMAT. Any CUDA array previously bound to \p hTexRef is + * unbound. + * + * \param hTexRef - Texture reference to bind + * \param hArray - Array to bind + * \param Flags - Options (must be ::CU_TRSA_OVERRIDE_FORMAT) + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags); + +/** + * \brief Binds a mipmapped array to a texture reference + * + * \deprecated + * + * Binds the CUDA mipmapped array \p hMipmappedArray to the texture reference \p hTexRef. + * Any previous address or CUDA array state associated with the texture reference + * is superseded by this function. \p Flags must be set to ::CU_TRSA_OVERRIDE_FORMAT. + * Any CUDA array previously bound to \p hTexRef is unbound. + * + * \param hTexRef - Texture reference to bind + * \param hMipmappedArray - Mipmapped array to bind + * \param Flags - Options (must be ::CU_TRSA_OVERRIDE_FORMAT) + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmappedArray(CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags); + +/** + * \brief Binds an address as a texture reference + * + * \deprecated + * + * Binds a linear address range to the texture reference \p hTexRef. Any + * previous address or CUDA array state associated with the texture reference + * is superseded by this function. Any memory previously bound to \p hTexRef + * is unbound. + * + * Since the hardware enforces an alignment requirement on texture base + * addresses, ::cuTexRefSetAddress() passes back a byte offset in + * \p *ByteOffset that must be applied to texture fetches in order to read from + * the desired memory. This offset must be divided by the texel size and + * passed to kernels that read from the texture so they can be applied to the + * ::tex1Dfetch() function. + * + * If the device memory pointer was returned from ::cuMemAlloc(), the offset + * is guaranteed to be 0 and NULL may be passed as the \p ByteOffset parameter. + * + * The total number of elements (or texels) in the linear address range + * cannot exceed ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH. + * The number of elements is computed as (\p bytes / bytesPerElement), + * where bytesPerElement is determined from the data format and number of + * components set using ::cuTexRefSetFormat(). + * + * \param ByteOffset - Returned byte offset + * \param hTexRef - Texture reference to bind + * \param dptr - Device pointer to bind + * \param bytes - Size of memory to bind in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes); + +/** + * \brief Binds an address as a 2D texture reference + * + * \deprecated + * + * Binds a linear address range to the texture reference \p hTexRef. Any + * previous address or CUDA array state associated with the texture reference + * is superseded by this function. Any memory previously bound to \p hTexRef + * is unbound. + * + * Using a ::tex2D() function inside a kernel requires a call to either + * ::cuTexRefSetArray() to bind the corresponding texture reference to an + * array, or ::cuTexRefSetAddress2D() to bind the texture reference to linear + * memory. + * + * Function calls to ::cuTexRefSetFormat() cannot follow calls to + * ::cuTexRefSetAddress2D() for the same texture reference. + * + * It is required that \p dptr be aligned to the appropriate hardware-specific + * texture alignment. You can query this value using the device attribute + * ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. If an unaligned \p dptr is + * supplied, ::CUDA_ERROR_INVALID_VALUE is returned. + * + * \p Pitch has to be aligned to the hardware-specific texture pitch alignment. + * This value can be queried using the device attribute + * ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. If an unaligned \p Pitch is + * supplied, ::CUDA_ERROR_INVALID_VALUE is returned. + * + * Width and Height, which are specified in elements (or texels), cannot exceed + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH and + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT respectively. + * \p Pitch, which is specified in bytes, cannot exceed + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH. + * + * \param hTexRef - Texture reference to bind + * \param desc - Descriptor of CUDA array + * \param dptr - Device pointer to bind + * \param Pitch - Line pitch in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexRefSetAddress, + * ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch); + +/** + * \brief Sets the format for a texture reference + * + * \deprecated + * + * Specifies the format of the data to be read by the texture reference + * \p hTexRef. \p fmt and \p NumPackedComponents are exactly analogous to the + * ::Format and ::NumChannels members of the ::CUDA_ARRAY_DESCRIPTOR structure: + * They specify the format of each component and the number of components per + * array element. + * + * \param hTexRef - Texture reference + * \param fmt - Format to set + * \param NumPackedComponents - Number of components per array element + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaCreateChannelDesc + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents); + +/** + * \brief Sets the addressing mode for a texture reference + * + * \deprecated + * + * Specifies the addressing mode \p am for the given dimension \p dim of the + * texture reference \p hTexRef. If \p dim is zero, the addressing mode is + * applied to the first parameter of the functions used to fetch from the + * texture; if \p dim is 1, the second, and so on. ::CUaddress_mode is defined + * as: + * \code + typedef enum CUaddress_mode_enum { + CU_TR_ADDRESS_MODE_WRAP = 0, + CU_TR_ADDRESS_MODE_CLAMP = 1, + CU_TR_ADDRESS_MODE_MIRROR = 2, + CU_TR_ADDRESS_MODE_BORDER = 3 + } CUaddress_mode; + * \endcode + * + * Note that this call has no effect if \p hTexRef is bound to linear memory. + * Also, if the flag, ::CU_TRSF_NORMALIZED_COORDINATES, is not set, the only + * supported address mode is ::CU_TR_ADDRESS_MODE_CLAMP. + * + * \param hTexRef - Texture reference + * \param dim - Dimension + * \param am - Addressing mode to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am); + +/** + * \brief Sets the filtering mode for a texture reference + * + * \deprecated + * + * Specifies the filtering mode \p fm to be used when reading memory through + * the texture reference \p hTexRef. ::CUfilter_mode_enum is defined as: + * + * \code + typedef enum CUfilter_mode_enum { + CU_TR_FILTER_MODE_POINT = 0, + CU_TR_FILTER_MODE_LINEAR = 1 + } CUfilter_mode; + * \endcode + * + * Note that this call has no effect if \p hTexRef is bound to linear memory. + * + * \param hTexRef - Texture reference + * \param fm - Filtering mode to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm); + +/** + * \brief Sets the mipmap filtering mode for a texture reference + * + * \deprecated + * + * Specifies the mipmap filtering mode \p fm to be used when reading memory through + * the texture reference \p hTexRef. ::CUfilter_mode_enum is defined as: + * + * \code + typedef enum CUfilter_mode_enum { + CU_TR_FILTER_MODE_POINT = 0, + CU_TR_FILTER_MODE_LINEAR = 1 + } CUfilter_mode; + * \endcode + * + * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array. + * + * \param hTexRef - Texture reference + * \param fm - Filtering mode to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapFilterMode(CUtexref hTexRef, CUfilter_mode fm); + +/** + * \brief Sets the mipmap level bias for a texture reference + * + * \deprecated + * + * Specifies the mipmap level bias \p bias to be added to the specified mipmap level when + * reading memory through the texture reference \p hTexRef. + * + * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array. + * + * \param hTexRef - Texture reference + * \param bias - Mipmap level bias + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapLevelBias(CUtexref hTexRef, float bias); + +/** + * \brief Sets the mipmap min/max mipmap level clamps for a texture reference + * + * \deprecated + * + * Specifies the min/max mipmap level clamps, \p minMipmapLevelClamp and \p maxMipmapLevelClamp + * respectively, to be used when reading memory through the texture reference + * \p hTexRef. + * + * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array. + * + * \param hTexRef - Texture reference + * \param minMipmapLevelClamp - Mipmap min level clamp + * \param maxMipmapLevelClamp - Mipmap max level clamp + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapLevelClamp(CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp); + +/** + * \brief Sets the maximum anisotropy for a texture reference + * + * \deprecated + * + * Specifies the maximum anisotropy \p maxAniso to be used when reading memory through + * the texture reference \p hTexRef. + * + * Note that this call has no effect if \p hTexRef is bound to linear memory. + * + * \param hTexRef - Texture reference + * \param maxAniso - Maximum anisotropy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMaxAnisotropy(CUtexref hTexRef, unsigned int maxAniso); + +/** + * \brief Sets the border color for a texture reference + * + * \deprecated + * + * Specifies the value of the RGBA color via the \p pBorderColor to the texture reference + * \p hTexRef. The color value supports only float type and holds color components in + * the following sequence: + * pBorderColor[0] holds 'R' component + * pBorderColor[1] holds 'G' component + * pBorderColor[2] holds 'B' component + * pBorderColor[3] holds 'A' component + * + * Note that the color values can be set only when the Address mode is set to + * CU_TR_ADDRESS_MODE_BORDER using ::cuTexRefSetAddressMode. + * Applications using integer border color values have to "reinterpret_cast" their values to float. + * + * \param hTexRef - Texture reference + * \param pBorderColor - RGBA color + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexRefSetAddressMode, + * ::cuTexRefGetAddressMode, ::cuTexRefGetBorderColor + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetBorderColor(CUtexref hTexRef, float *pBorderColor); + +/** + * \brief Sets the flags for a texture reference + * + * \deprecated + * + * Specifies optional flags via \p Flags to specify the behavior of data + * returned through the texture reference \p hTexRef. The valid flags are: + * + * - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of + * having the texture promote integer data to floating point data in the + * range [0, 1]. Note that texture with 32-bit integer format + * would not be promoted, regardless of whether or not this + * flag is specified; + * - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the + * default behavior of having the texture coordinates range + * from [0, Dim) where Dim is the width or height of the CUDA + * array. Instead, the texture coordinates [0, 1.0) reference + * the entire breadth of the array dimension; + * - ::CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION, which disables any trilinear + * filtering optimizations. Trilinear optimizations improve texture filtering + * performance by allowing bilinear filtering on textures in scenarios where + * it can closely approximate the expected results. + * + * \param hTexRef - Texture reference + * \param Flags - Optional flags to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags); + +/** + * \brief Gets the address associated with a texture reference + * + * \deprecated + * + * Returns in \p *pdptr the base address bound to the texture reference + * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference + * is not bound to any device memory range. + * + * \param pdptr - Returned device address + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef); + +/** + * \brief Gets the array bound to a texture reference + * + * \deprecated + * + * Returns in \p *phArray the CUDA array bound to the texture reference + * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference + * is not bound to any CUDA array. + * + * \param phArray - Returned array + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetArray(CUarray *phArray, CUtexref hTexRef); + +/** + * \brief Gets the mipmapped array bound to a texture reference + * + * \deprecated + * + * Returns in \p *phMipmappedArray the CUDA mipmapped array bound to the texture + * reference \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference + * is not bound to any CUDA mipmapped array. + * + * \param phMipmappedArray - Returned mipmapped array + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmappedArray(CUmipmappedArray *phMipmappedArray, CUtexref hTexRef); + +/** + * \brief Gets the addressing mode used by a texture reference + * + * \deprecated + * + * Returns in \p *pam the addressing mode corresponding to the + * dimension \p dim of the texture reference \p hTexRef. Currently, the only + * valid value for \p dim are 0 and 1. + * + * \param pam - Returned addressing mode + * \param hTexRef - Texture reference + * \param dim - Dimension + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef, int dim); + +/** + * \brief Gets the filter-mode used by a texture reference + * + * \deprecated + * + * Returns in \p *pfm the filtering mode of the texture reference + * \p hTexRef. + * + * \param pfm - Returned filtering mode + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef); + +/** + * \brief Gets the format used by a texture reference + * + * \deprecated + * + * Returns in \p *pFormat and \p *pNumChannels the format and number + * of components of the CUDA array bound to the texture reference \p hTexRef. + * If \p pFormat or \p pNumChannels is NULL, it will be ignored. + * + * \param pFormat - Returned format + * \param pNumChannels - Returned number of components + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef); + +/** + * \brief Gets the mipmap filtering mode for a texture reference + * + * \deprecated + * + * Returns the mipmap filtering mode in \p pfm that's used when reading memory through + * the texture reference \p hTexRef. + * + * \param pfm - Returned mipmap filtering mode + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapFilterMode(CUfilter_mode *pfm, CUtexref hTexRef); + +/** + * \brief Gets the mipmap level bias for a texture reference + * + * \deprecated + * + * Returns the mipmap level bias in \p pBias that's added to the specified mipmap + * level when reading memory through the texture reference \p hTexRef. + * + * \param pbias - Returned mipmap level bias + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapLevelBias(float *pbias, CUtexref hTexRef); + +/** + * \brief Gets the min/max mipmap level clamps for a texture reference + * + * \deprecated + * + * Returns the min/max mipmap level clamps in \p pminMipmapLevelClamp and \p pmaxMipmapLevelClamp + * that's used when reading memory through the texture reference \p hTexRef. + * + * \param pminMipmapLevelClamp - Returned mipmap min level clamp + * \param pmaxMipmapLevelClamp - Returned mipmap max level clamp + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapLevelClamp(float *pminMipmapLevelClamp, float *pmaxMipmapLevelClamp, CUtexref hTexRef); + +/** + * \brief Gets the maximum anisotropy for a texture reference + * + * \deprecated + * + * Returns the maximum anisotropy in \p pmaxAniso that's used when reading memory through + * the texture reference \p hTexRef. + * + * \param pmaxAniso - Returned maximum anisotropy + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMaxAnisotropy(int *pmaxAniso, CUtexref hTexRef); + +/** + * \brief Gets the border color used by a texture reference + * + * \deprecated + * + * Returns in \p pBorderColor, values of the RGBA color used by + * the texture reference \p hTexRef. + * The color value is of type float and holds color components in + * the following sequence: + * pBorderColor[0] holds 'R' component + * pBorderColor[1] holds 'G' component + * pBorderColor[2] holds 'B' component + * pBorderColor[3] holds 'A' component + * + * \param hTexRef - Texture reference + * \param pBorderColor - Returned Type and Value of RGBA color + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddressMode, + * ::cuTexRefSetAddressMode, ::cuTexRefSetBorderColor + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetBorderColor(float *pBorderColor, CUtexref hTexRef); + +/** + * \brief Gets the flags used by a texture reference + * + * \deprecated + * + * Returns in \p *pFlags the flags of the texture reference \p hTexRef. + * + * \param pFlags - Returned flags + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFormat + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef); + +/** + * \brief Creates a texture reference + * + * \deprecated + * + * Creates a texture reference and returns its handle in \p *pTexRef. Once + * created, the application must call ::cuTexRefSetArray() or + * ::cuTexRefSetAddress() to associate the reference with allocated memory. + * Other texture reference functions are used to specify the format and + * interpretation (addressing, filtering, etc.) to be used when the memory is + * read through this texture reference. + * + * \param pTexRef - Returned texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefDestroy + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefCreate(CUtexref *pTexRef); + +/** + * \brief Destroys a texture reference + * + * \deprecated + * + * Destroys the texture reference specified by \p hTexRef. + * + * \param hTexRef - Texture reference to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefCreate + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefDestroy(CUtexref hTexRef); + +/** @} */ /* END CUDA_TEXREF_DEPRECATED */ + + +/** + * \defgroup CUDA_SURFREF_DEPRECATED Surface Reference Management [DEPRECATED] + * + * ___MANBRIEF___ surface reference management functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the surface reference management functions of the + * low-level CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Sets the CUDA array for a surface reference. + * + * \deprecated + * + * Sets the CUDA array \p hArray to be read and written by the surface reference + * \p hSurfRef. Any previous CUDA array state associated with the surface + * reference is superseded by this function. \p Flags must be set to 0. + * The ::CUDA_ARRAY3D_SURFACE_LDST flag must have been set for the CUDA array. + * Any CUDA array previously bound to \p hSurfRef is unbound. + + * \param hSurfRef - Surface reference handle + * \param hArray - CUDA array handle + * \param Flags - set to 0 + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuModuleGetSurfRef, + * ::cuSurfRefGetArray + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags); + +/** + * \brief Passes back the CUDA array bound to a surface reference. + * + * \deprecated + * + * Returns in \p *phArray the CUDA array bound to the surface reference + * \p hSurfRef, or returns ::CUDA_ERROR_INVALID_VALUE if the surface reference + * is not bound to any CUDA array. + + * \param phArray - Surface reference handle + * \param hSurfRef - Surface reference handle + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuModuleGetSurfRef, ::cuSurfRefSetArray + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef); + +/** @} */ /* END CUDA_SURFREF_DEPRECATED */ + +/** + * \defgroup CUDA_TEXOBJECT Texture Object Management + * + * ___MANBRIEF___ texture object management functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the texture object management functions of the + * low-level CUDA driver application programming interface. The texture + * object API is only supported on devices of compute capability 3.0 or higher. + * + * @{ + */ + +/** + * \brief Creates a texture object + * + * Creates a texture object and returns it in \p pTexObject. \p pResDesc describes + * the data to texture from. \p pTexDesc describes how the data should be sampled. + * \p pResViewDesc is an optional argument that specifies an alternate format for + * the data described by \p pResDesc, and also describes the subresource region + * to restrict access to when texturing. \p pResViewDesc can only be specified if + * the type of resource is a CUDA array or a CUDA mipmapped array. + * + * Texture objects are only supported on devices of compute capability 3.0 or higher. + * Additionally, a texture object is an opaque value, and, as such, should only be + * accessed through CUDA API calls. + * + * The ::CUDA_RESOURCE_DESC structure is defined as: + * \code + typedef struct CUDA_RESOURCE_DESC_st + { + CUresourcetype resType; + + union { + struct { + CUarray hArray; + } array; + struct { + CUmipmappedArray hMipmappedArray; + } mipmap; + struct { + CUdeviceptr devPtr; + CUarray_format format; + unsigned int numChannels; + size_t sizeInBytes; + } linear; + struct { + CUdeviceptr devPtr; + CUarray_format format; + unsigned int numChannels; + size_t width; + size_t height; + size_t pitchInBytes; + } pitch2D; + } res; + + unsigned int flags; + } CUDA_RESOURCE_DESC; + + * \endcode + * where: + * - ::CUDA_RESOURCE_DESC::resType specifies the type of resource to texture from. + * CUresourceType is defined as: + * \code + typedef enum CUresourcetype_enum { + CU_RESOURCE_TYPE_ARRAY = 0x00, + CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, + CU_RESOURCE_TYPE_LINEAR = 0x02, + CU_RESOURCE_TYPE_PITCH2D = 0x03 + } CUresourcetype; + * \endcode + * + * \par + * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_ARRAY, ::CUDA_RESOURCE_DESC::res::array::hArray + * must be set to a valid CUDA array handle. + * + * \par + * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_MIPMAPPED_ARRAY, ::CUDA_RESOURCE_DESC::res::mipmap::hMipmappedArray + * must be set to a valid CUDA mipmapped array handle. + * + * \par + * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_LINEAR, ::CUDA_RESOURCE_DESC::res::linear::devPtr + * must be set to a valid device pointer, that is aligned to ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. + * ::CUDA_RESOURCE_DESC::res::linear::format and ::CUDA_RESOURCE_DESC::res::linear::numChannels + * describe the format of each component and the number of components per array element. ::CUDA_RESOURCE_DESC::res::linear::sizeInBytes + * specifies the size of the array in bytes. The total number of elements in the linear address range cannot exceed + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH. The number of elements is computed as (sizeInBytes / (sizeof(format) * numChannels)). + * + * \par + * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_PITCH2D, ::CUDA_RESOURCE_DESC::res::pitch2D::devPtr + * must be set to a valid device pointer, that is aligned to ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. + * ::CUDA_RESOURCE_DESC::res::pitch2D::format and ::CUDA_RESOURCE_DESC::res::pitch2D::numChannels + * describe the format of each component and the number of components per array element. ::CUDA_RESOURCE_DESC::res::pitch2D::width + * and ::CUDA_RESOURCE_DESC::res::pitch2D::height specify the width and height of the array in elements, and cannot exceed + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH and ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT respectively. + * ::CUDA_RESOURCE_DESC::res::pitch2D::pitchInBytes specifies the pitch between two rows in bytes and has to be aligned to + * ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. Pitch cannot exceed ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH. + * + * - ::flags must be set to zero. + * + * + * The ::CUDA_TEXTURE_DESC struct is defined as + * \code + typedef struct CUDA_TEXTURE_DESC_st { + CUaddress_mode addressMode[3]; + CUfilter_mode filterMode; + unsigned int flags; + unsigned int maxAnisotropy; + CUfilter_mode mipmapFilterMode; + float mipmapLevelBias; + float minMipmapLevelClamp; + float maxMipmapLevelClamp; + } CUDA_TEXTURE_DESC; + * \endcode + * where + * - ::CUDA_TEXTURE_DESC::addressMode specifies the addressing mode for each dimension of the texture data. ::CUaddress_mode is defined as: + * \code + typedef enum CUaddress_mode_enum { + CU_TR_ADDRESS_MODE_WRAP = 0, + CU_TR_ADDRESS_MODE_CLAMP = 1, + CU_TR_ADDRESS_MODE_MIRROR = 2, + CU_TR_ADDRESS_MODE_BORDER = 3 + } CUaddress_mode; + * \endcode + * This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR. Also, if the flag, ::CU_TRSF_NORMALIZED_COORDINATES + * is not set, the only supported address mode is ::CU_TR_ADDRESS_MODE_CLAMP. + * + * - ::CUDA_TEXTURE_DESC::filterMode specifies the filtering mode to be used when fetching from the texture. CUfilter_mode is defined as: + * \code + typedef enum CUfilter_mode_enum { + CU_TR_FILTER_MODE_POINT = 0, + CU_TR_FILTER_MODE_LINEAR = 1 + } CUfilter_mode; + * \endcode + * This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR. + * + * - ::CUDA_TEXTURE_DESC::flags can be any combination of the following: + * - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of + * having the texture promote integer data to floating point data in the + * range [0, 1]. Note that texture with 32-bit integer format would not be + * promoted, regardless of whether or not this flag is specified. + * - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the default behavior + * of having the texture coordinates range from [0, Dim) where Dim is the + * width or height of the CUDA array. Instead, the texture coordinates + * [0, 1.0) reference the entire breadth of the array dimension; Note that + * for CUDA mipmapped arrays, this flag has to be set. + * - ::CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION, which disables any trilinear + * filtering optimizations. Trilinear optimizations improve texture filtering + * performance by allowing bilinear filtering on textures in scenarios where + * it can closely approximate the expected results. + * - ::CU_TRSF_SEAMLESS_CUBEMAP, which enables seamless cube map filtering. + * This flag can only be specified if the underlying resource is a CUDA array + * or a CUDA mipmapped array that was created with the flag ::CUDA_ARRAY3D_CUBEMAP. + * When seamless cube map filtering is enabled, texture address modes specified + * by ::CUDA_TEXTURE_DESC::addressMode are ignored. Instead, if the ::CUDA_TEXTURE_DESC::filterMode + * is set to ::CU_TR_FILTER_MODE_POINT the address mode ::CU_TR_ADDRESS_MODE_CLAMP + * will be applied for all dimensions. If the ::CUDA_TEXTURE_DESC::filterMode is + * set to ::CU_TR_FILTER_MODE_LINEAR seamless cube map filtering will be performed + * when sampling along the cube face borders. + * + * - ::CUDA_TEXTURE_DESC::maxAnisotropy specifies the maximum anisotropy ratio to be used when doing anisotropic filtering. This value will be + * clamped to the range [1,16]. + * + * - ::CUDA_TEXTURE_DESC::mipmapFilterMode specifies the filter mode when the calculated mipmap level lies between two defined mipmap levels. + * + * - ::CUDA_TEXTURE_DESC::mipmapLevelBias specifies the offset to be applied to the calculated mipmap level. + * + * - ::CUDA_TEXTURE_DESC::minMipmapLevelClamp specifies the lower end of the mipmap level range to clamp access to. + * + * - ::CUDA_TEXTURE_DESC::maxMipmapLevelClamp specifies the upper end of the mipmap level range to clamp access to. + * + * + * The ::CUDA_RESOURCE_VIEW_DESC struct is defined as + * \code + typedef struct CUDA_RESOURCE_VIEW_DESC_st + { + CUresourceViewFormat format; + size_t width; + size_t height; + size_t depth; + unsigned int firstMipmapLevel; + unsigned int lastMipmapLevel; + unsigned int firstLayer; + unsigned int lastLayer; + } CUDA_RESOURCE_VIEW_DESC; + * \endcode + * where: + * - ::CUDA_RESOURCE_VIEW_DESC::format specifies how the data contained in the CUDA array or CUDA mipmapped array should + * be interpreted. Note that this can incur a change in size of the texture data. If the resource view format is a block + * compressed format, then the underlying CUDA array or CUDA mipmapped array has to have a base of format ::CU_AD_FORMAT_UNSIGNED_INT32. + * with 2 or 4 channels, depending on the block compressed format. For ex., BC1 and BC4 require the underlying CUDA array to have + * a format of ::CU_AD_FORMAT_UNSIGNED_INT32 with 2 channels. The other BC formats require the underlying resource to have the same base + * format but with 4 channels. + * + * - ::CUDA_RESOURCE_VIEW_DESC::width specifies the new width of the texture data. If the resource view format is a block + * compressed format, this value has to be 4 times the original width of the resource. For non block compressed formats, + * this value has to be equal to that of the original resource. + * + * - ::CUDA_RESOURCE_VIEW_DESC::height specifies the new height of the texture data. If the resource view format is a block + * compressed format, this value has to be 4 times the original height of the resource. For non block compressed formats, + * this value has to be equal to that of the original resource. + * + * - ::CUDA_RESOURCE_VIEW_DESC::depth specifies the new depth of the texture data. This value has to be equal to that of the + * original resource. + * + * - ::CUDA_RESOURCE_VIEW_DESC::firstMipmapLevel specifies the most detailed mipmap level. This will be the new mipmap level zero. + * For non-mipmapped resources, this value has to be zero.::CUDA_TEXTURE_DESC::minMipmapLevelClamp and ::CUDA_TEXTURE_DESC::maxMipmapLevelClamp + * will be relative to this value. For ex., if the firstMipmapLevel is set to 2, and a minMipmapLevelClamp of 1.2 is specified, + * then the actual minimum mipmap level clamp will be 3.2. + * + * - ::CUDA_RESOURCE_VIEW_DESC::lastMipmapLevel specifies the least detailed mipmap level. For non-mipmapped resources, this value + * has to be zero. + * + * - ::CUDA_RESOURCE_VIEW_DESC::firstLayer specifies the first layer index for layered textures. This will be the new layer zero. + * For non-layered resources, this value has to be zero. + * + * - ::CUDA_RESOURCE_VIEW_DESC::lastLayer specifies the last layer index for layered textures. For non-layered resources, + * this value has to be zero. + * + * + * \param pTexObject - Texture object to create + * \param pResDesc - Resource descriptor + * \param pTexDesc - Texture descriptor + * \param pResViewDesc - Resource view descriptor + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexObjectDestroy, + * ::cudaCreateTextureObject + */ +CUresult CUDAAPI cuTexObjectCreate(CUtexObject *pTexObject, const CUDA_RESOURCE_DESC *pResDesc, const CUDA_TEXTURE_DESC *pTexDesc, const CUDA_RESOURCE_VIEW_DESC *pResViewDesc); + +/** + * \brief Destroys a texture object + * + * Destroys the texture object specified by \p texObject. + * + * \param texObject - Texture object to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexObjectCreate, + * ::cudaDestroyTextureObject + */ +CUresult CUDAAPI cuTexObjectDestroy(CUtexObject texObject); + +/** + * \brief Returns a texture object's resource descriptor + * + * Returns the resource descriptor for the texture object specified by \p texObject. + * + * \param pResDesc - Resource descriptor + * \param texObject - Texture object + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexObjectCreate, + * ::cudaGetTextureObjectResourceDesc, + */ +CUresult CUDAAPI cuTexObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUtexObject texObject); + +/** + * \brief Returns a texture object's texture descriptor + * + * Returns the texture descriptor for the texture object specified by \p texObject. + * + * \param pTexDesc - Texture descriptor + * \param texObject - Texture object + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexObjectCreate, + * ::cudaGetTextureObjectTextureDesc + */ +CUresult CUDAAPI cuTexObjectGetTextureDesc(CUDA_TEXTURE_DESC *pTexDesc, CUtexObject texObject); + +/** + * \brief Returns a texture object's resource view descriptor + * + * Returns the resource view descriptor for the texture object specified by \p texObject. + * If no resource view was set for \p texObject, the ::CUDA_ERROR_INVALID_VALUE is returned. + * + * \param pResViewDesc - Resource view descriptor + * \param texObject - Texture object + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexObjectCreate, + * ::cudaGetTextureObjectResourceViewDesc + */ +CUresult CUDAAPI cuTexObjectGetResourceViewDesc(CUDA_RESOURCE_VIEW_DESC *pResViewDesc, CUtexObject texObject); + +/** @} */ /* END CUDA_TEXOBJECT */ + +/** + * \defgroup CUDA_SURFOBJECT Surface Object Management + * + * ___MANBRIEF___ surface object management functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the surface object management functions of the + * low-level CUDA driver application programming interface. The surface + * object API is only supported on devices of compute capability 3.0 or higher. + * + * @{ + */ + +/** + * \brief Creates a surface object + * + * Creates a surface object and returns it in \p pSurfObject. \p pResDesc describes + * the data to perform surface load/stores on. ::CUDA_RESOURCE_DESC::resType must be + * ::CU_RESOURCE_TYPE_ARRAY and ::CUDA_RESOURCE_DESC::res::array::hArray + * must be set to a valid CUDA array handle. ::CUDA_RESOURCE_DESC::flags must be set to zero. + * + * Surface objects are only supported on devices of compute capability 3.0 or higher. + * Additionally, a surface object is an opaque value, and, as such, should only be + * accessed through CUDA API calls. + * + * \param pSurfObject - Surface object to create + * \param pResDesc - Resource descriptor + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuSurfObjectDestroy, + * ::cudaCreateSurfaceObject + */ +CUresult CUDAAPI cuSurfObjectCreate(CUsurfObject *pSurfObject, const CUDA_RESOURCE_DESC *pResDesc); + +/** + * \brief Destroys a surface object + * + * Destroys the surface object specified by \p surfObject. + * + * \param surfObject - Surface object to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuSurfObjectCreate, + * ::cudaDestroySurfaceObject + */ +CUresult CUDAAPI cuSurfObjectDestroy(CUsurfObject surfObject); + +/** + * \brief Returns a surface object's resource descriptor + * + * Returns the resource descriptor for the surface object specified by \p surfObject. + * + * \param pResDesc - Resource descriptor + * \param surfObject - Surface object + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuSurfObjectCreate, + * ::cudaGetSurfaceObjectResourceDesc + */ +CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsurfObject surfObject); + +/** @} */ /* END CUDA_SURFOBJECT */ + +/** + * \defgroup CUDA_TENSOR_MEMORY Tensor Core Managment + * + * ___MANBRIEF___ tensor core management functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the tensor core management functions of the + * low-level CUDA driver application programming interface. The tensor + * core API is only supported on devices of compute capability 9.0 or higher. + * + * @{ + */ + +/** + * \brief Create a tensor map descriptor object representing tiled memory region + * + * Creates a descriptor for Tensor Memory Access (TMA) object specified + * by the parameters describing a tiled region and returns it in \p tensorMap. + * + * Tensor map objects are only supported on devices of compute capability 9.0 or higher. + * Additionally, a tensor map object is an opaque value, and, as such, should only be + * accessed through CUDA API calls. + * + * The parameters passed are bound to the following requirements: + * + * - \p tensorMap address must be aligned to 64 bytes. + * + * - \p tensorDataType has to be an enum from ::CUtensorMapDataType which is defined as: + * \code + typedef enum CUtensorMapDataType_enum { + CU_TENSOR_MAP_DATA_TYPE_UINT8 = 0, // 1 byte + CU_TENSOR_MAP_DATA_TYPE_UINT16, // 2 bytes + CU_TENSOR_MAP_DATA_TYPE_UINT32, // 4 bytes + CU_TENSOR_MAP_DATA_TYPE_INT32, // 4 bytes + CU_TENSOR_MAP_DATA_TYPE_UINT64, // 8 bytes + CU_TENSOR_MAP_DATA_TYPE_INT64, // 8 bytes + CU_TENSOR_MAP_DATA_TYPE_FLOAT16, // 2 bytes + CU_TENSOR_MAP_DATA_TYPE_FLOAT32, // 4 bytes + CU_TENSOR_MAP_DATA_TYPE_FLOAT64, // 8 bytes + CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, // 2 bytes + CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ, // 4 bytes + CU_TENSOR_MAP_DATA_TYPE_TFLOAT32, // 4 bytes + CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ // 4 bytes + } CUtensorMapDataType; + * \endcode + * + * - \p tensorRank must be non-zero and less than or equal to the maximum supported dimensionality of 5. If \p interleave is not + * ::CU_TENSOR_MAP_INTERLEAVE_NONE, then \p tensorRank must additionally be greater than or equal to 3. + * + * - \p globalAddress, which specifies the starting address of the memory region described, must be 32 byte aligned when \p interleave is + * ::CU_TENSOR_MAP_INTERLEAVE_32B and 16 byte aligned otherwise. + * + * - \p globalDim array, which specifies tensor size of each of the \p tensorRank dimensions, must be non-zero and less than or + * equal to 2^32. + * + * - \p globalStrides array, which specifies tensor stride of each of the lower \p tensorRank - 1 dimensions in bytes, must be a + * multiple of 16 and less than 2^40. Additionally, the stride must be a multiple of 32 when \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B. + * Each following dimension specified includes previous dimension stride: + * \code + globalStrides[0] = globalDim[0] * elementSizeInBytes(tensorDataType) + padding[0]; + for (i = 1; i < tensorRank - 1; i++) + globalStrides[i] = globalStrides[i – 1] * globalStrides[i] + padding[i]; + assert(globalStrides[i] >= globalDim[i]); + * \endcode + * + * - \p boxDim array, which specifies number of elements to be traversed along each of the \p tensorRank dimensions, must be less + * than or equal to 8. + * When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE, { \p boxDim[0] * elementSizeInBytes( \p tensorDataType ) } must be a multiple + * of 16 bytes. + * + * - \p elementStrides array, which specifies the iteration step along each of the \p tensorRank dimensions, must be non-zero and less + * than or equal to 8. Note that when \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE, the first element of this array is ignored since + * TMA doesn’t support the stride for dimension zero. + * When all elemets of \p elementStrides array is one, \p boxDim specifies the number of elements to load. However, if the \p elementStrides[i] + * is not equal to one, then TMA loads ceil( \p boxDim[i] / \p elementStrides[i]) number of elements along i-th dimension. To load N elements along + * i-th dimension, \p boxDim[i] must be set to N * \p elementStrides[i]. + * + * - \p interleave specifies the interleaved layout of type ::CUtensorMapInterleave, which is defined as: + * \code + typedef enum CUtensorMapInterleave_enum { + CU_TENSOR_MAP_INTERLEAVE_NONE = 0, + CU_TENSOR_MAP_INTERLEAVE_16B, + CU_TENSOR_MAP_INTERLEAVE_32B + } CUtensorMapInterleave; + * \endcode + * TMA supports interleaved layouts like NC/8HWC8 where C8 utilizes 16 bytes in memory assuming 2 byte per channel or NC/16HWC16 where C16 + * uses 32 bytes. + * When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE and \p swizzle is not ::CU_TENSOR_MAP_SWIZZLE_NONE, the bounding box inner dimension + * (computed as \p boxDim[0] multiplied by element size derived from \p tensorDataType) must be less than or equal to the swizzle size. + * - CU_TENSOR_MAP_SWIZZLE_32B implies the bounding box inner dimension will be <= 32. + * - CU_TENSOR_MAP_SWIZZLE_64B implies the bounding box inner dimension will be <= 64. + * - CU_TENSOR_MAP_SWIZZLE_128B implies the bounding box inner dimension will be <= 128. + * + * - \p swizzle, which specifies the shared memory bank swizzling pattern, has to be of type ::CUtensorMapSwizzle which is defined as: + * \code + typedef enum CUtensorMapSwizzle_enum { + CU_TENSOR_MAP_SWIZZLE_NONE = 0, + CU_TENSOR_MAP_SWIZZLE_32B, + CU_TENSOR_MAP_SWIZZLE_64B, + CU_TENSOR_MAP_SWIZZLE_128B + } CUtensorMapSwizzle; + * \endcode + * Data is organized in specific order in global memory; however, it may not match the order in which data are accessed by application in + * the shared memory. This difference in data organization may cause bank conflicts when shared memory is accessed. In order to avoid this + * problem, data can be loaded to shard memory with shuffling across shared memory banks. + * Note that it’s expected that when \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, \p swizzle should be ::CU_TENSOR_MAP_SWIZZLE_32B mode. + * Other interleave modes can have any swizzling patterns. + * + * - \p l2Promotion specifies L2 fetch size which indicates the byte granurality at which L2 requests is filled from DRAM. It must be of + * type ::CUtensorMapL2promotion, which is defined as: + * \code + typedef enum CUtensorMapL2promotion_enum { + CU_TENSOR_MAP_L2_PROMOTION_NONE = 0, + CU_TENSOR_MAP_L2_PROMOTION_L2_64B, + CU_TENSOR_MAP_L2_PROMOTION_L2_128B, + CU_TENSOR_MAP_L2_PROMOTION_L2_256B + } CUtensorMapL2promotion; + * \endcode + * + * - \p oobFill, which indicates whether zero or a special NaN constant should be used to fill out-of-bound elements, must be of type + * ::CUtensorMapFloatOOBfill which is defined as: + * \code + typedef enum CUtensorMapFloatOOBfill_enum { + CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE = 0, + CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA + } CUtensorMapFloatOOBfill; + * \endcode + * Note that ::CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA can only be used when \p tensorDataType represents a floating data type. + * + * \param tensorMap - Tensor map object to create + * \param tensorDataType - Tensor data type + * \param tensorRank - Dimensionality of tensor + * \param globalAddress - Starting address of memory region described by tensor + * \param globalDim - Array containing tensor size (number of elements) along each of the \p tensorRank dimensions + * \param globalStrides - Array containing stride size (in bytes) along each of the \p tensorRank - 1 dimensions + * \param boxDim - Array containing traversal box size (number of elments) along each of the \p tensorRank dimensions. Specifies how many elements to be traversed along each tensor dimension. + * \param elementStrides - Array containing traversal stride in each of the \p tensorRank dimensions + * \param interleave - Type of interleaved layout the tensor addresses + * \param swizzle - Bank swizzling pattern inside shared memory + * \param l2Promotion - L2 promotion size + * \param oobFill - Indicate whether zero or special NaN constant must be used to fill out-of-bound elements + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTensorMapEncodeIm2col, + * ::cuTensorMapReplaceAddress + */ +CUresult CUDAAPI cuTensorMapEncodeTiled(CUtensorMap *tensorMap, CUtensorMapDataType tensorDataType, cuuint32_t tensorRank, void *globalAddress, const cuuint64_t *globalDim, const cuuint64_t *globalStrides, const cuuint32_t *boxDim, const cuuint32_t *elementStrides, CUtensorMapInterleave interleave, CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill); + + +/** + * \brief Create a tensor map descriptor object representing im2col memory region + * + * Creates a descriptor for Tensor Memory Access (TMA) object specified + * by the parameters describing a im2col memory layout and returns it in \p tensorMap. + * + * Tensor map objects are only supported on devices of compute capability 9.0 or higher. + * Additionally, a tensor map object is an opaque value, and, as such, should only be + * accessed through CUDA API calls. + * + * The parameters passed are bound to the following requirements: + * + * - \p tensorMap address must be aligned to 64 bytes. + * + * - \p tensorDataType has to be an enum from ::CUtensorMapDataType which is defined as: + * \code + typedef enum CUtensorMapDataType_enum { + CU_TENSOR_MAP_DATA_TYPE_UINT8 = 0, // 1 byte + CU_TENSOR_MAP_DATA_TYPE_UINT16, // 2 bytes + CU_TENSOR_MAP_DATA_TYPE_UINT32, // 4 bytes + CU_TENSOR_MAP_DATA_TYPE_INT32, // 4 bytes + CU_TENSOR_MAP_DATA_TYPE_UINT64, // 8 bytes + CU_TENSOR_MAP_DATA_TYPE_INT64, // 8 bytes + CU_TENSOR_MAP_DATA_TYPE_FLOAT16, // 2 bytes + CU_TENSOR_MAP_DATA_TYPE_FLOAT32, // 4 bytes + CU_TENSOR_MAP_DATA_TYPE_FLOAT64, // 8 bytes + CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, // 2 bytes + CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ, // 4 bytes + CU_TENSOR_MAP_DATA_TYPE_TFLOAT32, // 4 bytes + CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ // 4 bytes + } CUtensorMapDataType; + * \endcode + * + * - \p tensorRank must be one of dimensions 3, 4, or 5. + * + * - \p globalAddress, which specifies the starting address of the memory region described, must be 32 byte aligned when \p interleave is + * ::CU_TENSOR_MAP_INTERLEAVE_32B and 16 byte aligned otherwise. + * + * - \p globalDim array, which specifies tensor size of each of the \p tensorRank dimensions, must be non-zero and less than or + * equal to 2^32. + * + * - \p globalStrides array, which specifies tensor stride of each of the lower \p tensorRank - 1 dimensions in bytes, must be a + * multiple of 16 and less than 2^40. Additionally, the stride must be a multiple of 32 when \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B. + * Each following dimension specified includes previous dimension stride: + * \code + globalStrides[0] = globalDim[0] * elementSizeInBytes(tensorDataType) + padding[0]; + for (i = 1; i < tensorRank - 1; i++) + globalStrides[i] = globalStrides[i – 1] * globalStrides[i] + padding[i]; + assert(globalStrides[i] >= globalDim[i]); + * \endcode + * + * - \p pixelBoxLowerCorner array specifies the coordinate offsets {D, H, W} of the bounding box from top/left/front corner. The number of + * offsets and their precision depends on the tensor dimensionality: + * - When \p tensorRank is 3, one signed offset within range [-32768, 32767] is supported. + * - When \p tensorRank is 4, two signed offsets each within range [-128, 127] are supported. + * - When \p tensorRank is 5, three offsets each within range [-16, 15] are supported. + * + * - \p pixelBoxUpperCorner array specifies the coordinate offsets {D, H, W} of the bounding box from bottom/right/back corner. The number of + * offsets and their precision depends on the tensor dimensionality: + * - When \p tensorRank is 3, one signed offset within range [-32768, 32767] is supported. + * - When \p tensorRank is 4, two signed offsets each within range [-128, 127] are supported. + * - When \p tensorRank is 5, three offsets each within range [-16, 15] are supported. + * The bounding box specified by \p pixelBoxLowerCorner and \p pixelBoxUpperCorner must have non-zero area. + * + * - \p channelsPerPixel, which specifies the number of elements which must be accessed along C dimension, must be less than or equal to 256. + * + * - \p pixelsPerColumn, which specifies the number of elements that must be accessed along the {N, D, H, W} dimensions, must be less than or + * equal to 1024. + * + * - \p elementStrides array, which specifies the iteration step along each of the \p tensorRank dimensions, must be non-zero and less + * than or equal to 8. Note that when \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE, the first element of this array is ignored since + * TMA doesn’t support the stride for dimension zero. + * When all elemets of \p elementStrides array is one, \p boxDim specifies the number of elements to load. However, if the \p elementStrides[i] + * is not equal to one, then TMA loads ceil( \p boxDim[i] / \p elementStrides[i]) number of elements along i-th dimension. To load N elements along + * i-th dimension, \p boxDim[i] must be set to N * \p elementStrides[i]. + * + * - \p interleave specifies the interleaved layout of type ::CUtensorMapInterleave, which is defined as: + * \code + typedef enum CUtensorMapInterleave_enum { + CU_TENSOR_MAP_INTERLEAVE_NONE = 0, + CU_TENSOR_MAP_INTERLEAVE_16B, + CU_TENSOR_MAP_INTERLEAVE_32B + } CUtensorMapInterleave; + * \endcode + * TMA supports interleaved layouts like NC/8HWC8 where C8 utilizes 16 bytes in memory assuming 2 byte per channel or NC/16HWC16 where C16 + * uses 32 bytes. + * When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE and \p swizzle is not ::CU_TENSOR_MAP_SWIZZLE_NONE, the bounding box inner dimension + * (computed as \p boxDim[0] multiplied by element size derived from \p tensorDataType) must be less than or equal to the swizzle size. + * - CU_TENSOR_MAP_SWIZZLE_32B implies the bounding box inner dimension will be <= 32. + * - CU_TENSOR_MAP_SWIZZLE_64B implies the bounding box inner dimension will be <= 64. + * - CU_TENSOR_MAP_SWIZZLE_128B implies the bounding box inner dimension will be <= 128. + * + * - \p swizzle, which specifies the shared memory bank swizzling pattern, has to be of type ::CUtensorMapSwizzle which is defined as: + * \code + typedef enum CUtensorMapSwizzle_enum { + CU_TENSOR_MAP_SWIZZLE_NONE = 0, + CU_TENSOR_MAP_SWIZZLE_32B, + CU_TENSOR_MAP_SWIZZLE_64B, + CU_TENSOR_MAP_SWIZZLE_128B + } CUtensorMapSwizzle; + * \endcode + * Data is organized in specific order in global memory; however, it may not match the order in which data are accessed by application in + * the shared memory. This difference in data organization may cause bank conflicts when shared memory is accessed. In order to avoid this + * problem, data can be loaded to shard memory with shuffling across shared memory banks. + * Note that it’s expected that when \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, \p swizzle should be ::CU_TENSOR_MAP_SWIZZLE_32B mode. + * Other interleave modes can have any swizzling patterns. + * + * - \p l2Promotion specifies L2 fetch size which indicates the byte granurality at which L2 requests is filled from DRAM. It must be of + * type ::CUtensorMapL2promotion, which is defined as: + * \code + typedef enum CUtensorMapL2promotion_enum { + CU_TENSOR_MAP_L2_PROMOTION_NONE = 0, + CU_TENSOR_MAP_L2_PROMOTION_L2_64B, + CU_TENSOR_MAP_L2_PROMOTION_L2_128B, + CU_TENSOR_MAP_L2_PROMOTION_L2_256B + } CUtensorMapL2promotion; + * \endcode + * + * - \p oobFill, which indicates whether zero or a special NaN constant should be used to fill out-of-bound elements, must be of type + * ::CUtensorMapFloatOOBfill which is defined as: + * \code + typedef enum CUtensorMapFloatOOBfill_enum { + CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE = 0, + CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA + } CUtensorMapFloatOOBfill; + * \endcode + * Note that ::CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA can only be used when \p tensorDataType represents a floating data type. + * + * \param tensorMap - Tensor map object to create + * \param tensorDataType - Tensor data type + * \param tensorRank - Dimensionality of tensor, needs to be at least of dimension 3 + * \param globalAddress - Starting address of memory region described by tensor + * \param globalDim - Array containing tensor size (number of elements) along each of the \p tensorRank dimensions + * \param globalStrides - Array containing stride size (in bytes) along each of the \p tensorRank - 1 dimensions + * \param pixelBoxLowerCorner - Array containing DHW dimentions of lower box corner + * \param pixelBoxUpperCorner - Array containing DHW dimentions of upper box corner + * \param channelsPerPixel - Number of channels per pixel + * \param pixelsPerColumn - Number of pixels per column + * \param elementStrides - Array containing traversal stride in each of the \p tensorRank dimensions + * \param interleave - Type of interleaved layout the tensor addresses + * \param swizzle - Bank swizzling pattern inside shared memory + * \param l2Promotion - L2 promotion size + * \param oobFill - Indicate whether zero or special NaN constant must be used to fill out-of-bound elements + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTensorMapEncodeTiled, + * ::cuTensorMapReplaceAddress + */ +CUresult CUDAAPI cuTensorMapEncodeIm2col(CUtensorMap *tensorMap, CUtensorMapDataType tensorDataType, cuuint32_t tensorRank, void *globalAddress, const cuuint64_t *globalDim, const cuuint64_t *globalStrides, const int *pixelBoxLowerCorner, const int *pixelBoxUpperCorner, cuuint32_t channelsPerPixel, cuuint32_t pixelsPerColumn, const cuuint32_t *elementStrides, CUtensorMapInterleave interleave, CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill); + +/** + * \brief Modify an existing tensor map descriptor with an updated global address + * + * Modifies the descriptor for Tensor Memory Access (TMA) object passed in \p tensorMap with + * an updated \p globalAddress. + * + * Tensor map objects are only supported on devices of compute capability 9.0 or higher. + * Additionally, a tensor map object is an opaque value, and, as such, should only be + * accessed through CUDA API calls. + * + * \param tensorMap - Tensor map object to modify + * \param globalAddress - Starting address of memory region described by tensor, must follow previous alignment requirements + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTensorMapEncodeTiled, + * ::cuTensorMapEncodeIm2col + */ +CUresult CUDAAPI cuTensorMapReplaceAddress(CUtensorMap *tensorMap, void *globalAddress); + +/** @} */ +/* END CUDA_TENSOR_MEMORY */ + +/** + * \defgroup CUDA_PEER_ACCESS Peer Context Memory Access + * + * ___MANBRIEF___ direct peer context memory access functions of the low-level + * CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the direct peer context memory access functions + * of the low-level CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Queries if a device may directly access a peer device's memory. + * + * Returns in \p *canAccessPeer a value of 1 if contexts on \p dev are capable of + * directly accessing memory from contexts on \p peerDev and 0 otherwise. + * If direct access of \p peerDev from \p dev is possible, then access may be + * enabled on two specific contexts by calling ::cuCtxEnablePeerAccess(). + * + * \param canAccessPeer - Returned access capability + * \param dev - Device from which allocations on \p peerDev are to + * be directly accessed. + * \param peerDev - Device on which the allocations to be directly accessed + * by \p dev reside. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuCtxEnablePeerAccess, + * ::cuCtxDisablePeerAccess, + * ::cudaDeviceCanAccessPeer + */ +CUresult CUDAAPI cuDeviceCanAccessPeer(int *canAccessPeer, CUdevice dev, CUdevice peerDev); + +/** + * \brief Enables direct access to memory allocations in a peer context. + * + * If both the current context and \p peerContext are on devices which support unified + * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING) and same + * major compute capability, then on success all allocations from \p peerContext will + * immediately be accessible by the current context. See \ref CUDA_UNIFIED for additional + * details. + * + * Note that access granted by this call is unidirectional and that in order to access + * memory from the current context in \p peerContext, a separate symmetric call + * to ::cuCtxEnablePeerAccess() is required. + * + * Note that there are both device-wide and system-wide limitations per system + * configuration, as noted in the CUDA Programming Guide under the section + * "Peer-to-Peer Memory Access". + * + * Returns ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED if ::cuDeviceCanAccessPeer() indicates + * that the ::CUdevice of the current context cannot directly access memory + * from the ::CUdevice of \p peerContext. + * + * Returns ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED if direct access of + * \p peerContext from the current context has already been enabled. + * + * Returns ::CUDA_ERROR_TOO_MANY_PEERS if direct peer access is not possible + * because hardware resources required for peer access have been exhausted. + * + * Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, \p peerContext + * is not a valid context, or if the current context is \p peerContext. + * + * Returns ::CUDA_ERROR_INVALID_VALUE if \p Flags is not 0. + * + * \param peerContext - Peer context to enable direct access to from the current context + * \param Flags - Reserved for future use and must be set to 0 + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, + * ::CUDA_ERROR_TOO_MANY_PEERS, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa + * ::cuDeviceCanAccessPeer, + * ::cuCtxDisablePeerAccess, + * ::cudaDeviceEnablePeerAccess + */ +CUresult CUDAAPI cuCtxEnablePeerAccess(CUcontext peerContext, unsigned int Flags); + +/** + * \brief Disables direct access to memory allocations in a peer context and + * unregisters any registered allocations. + * + Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has + * not yet been enabled from \p peerContext to the current context. + * + * Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, or if + * \p peerContext is not a valid context. + * + * \param peerContext - Peer context to disable direct access to + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * \notefnerr + * + * \sa + * ::cuDeviceCanAccessPeer, + * ::cuCtxEnablePeerAccess, + * ::cudaDeviceDisablePeerAccess + */ +CUresult CUDAAPI cuCtxDisablePeerAccess(CUcontext peerContext); + +/** + * \brief Queries attributes of the link between two devices. + * + * Returns in \p *value the value of the requested attribute \p attrib of the + * link between \p srcDevice and \p dstDevice. The supported attributes are: + * - ::CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK: A relative value indicating the + * performance of the link between two devices. + * - ::CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED P2P: 1 if P2P Access is enable. + * - ::CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED: 1 if Atomic operations over + * the link are supported. + * - ::CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED: 1 if cudaArray can + * be accessed over the link. + * + * Returns ::CUDA_ERROR_INVALID_DEVICE if \p srcDevice or \p dstDevice are not valid + * or if they represent the same device. + * + * Returns ::CUDA_ERROR_INVALID_VALUE if \p attrib is not valid or if \p value is + * a null pointer. + * + * \param value - Returned value of the requested attribute + * \param attrib - The requested attribute of the link between \p srcDevice and \p dstDevice. + * \param srcDevice - The source device of the target link. + * \param dstDevice - The destination device of the target link. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa + * ::cuCtxEnablePeerAccess, + * ::cuCtxDisablePeerAccess, + * ::cuDeviceCanAccessPeer, + * ::cudaDeviceGetP2PAttribute + */ +CUresult CUDAAPI cuDeviceGetP2PAttribute(int* value, CUdevice_P2PAttribute attrib, CUdevice srcDevice, CUdevice dstDevice); + +/** @} */ /* END CUDA_PEER_ACCESS */ + +/** + * \defgroup CUDA_GRAPHICS Graphics Interoperability + * + * ___MANBRIEF___ graphics interoperability functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the graphics interoperability functions of the + * low-level CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Unregisters a graphics resource for access by CUDA + * + * Unregisters the graphics resource \p resource so it is not accessible by + * CUDA unless registered again. + * + * If \p resource is invalid then ::CUDA_ERROR_INVALID_HANDLE is + * returned. + * + * \param resource - Resource to unregister + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cuGraphicsD3D9RegisterResource, + * ::cuGraphicsD3D10RegisterResource, + * ::cuGraphicsD3D11RegisterResource, + * ::cuGraphicsGLRegisterBuffer, + * ::cuGraphicsGLRegisterImage, + * ::cudaGraphicsUnregisterResource + */ +CUresult CUDAAPI cuGraphicsUnregisterResource(CUgraphicsResource resource); + +/** + * \brief Get an array through which to access a subresource of a mapped graphics resource. + * + * Returns in \p *pArray an array through which the subresource of the mapped + * graphics resource \p resource which corresponds to array index \p arrayIndex + * and mipmap level \p mipLevel may be accessed. The value set in \p *pArray may + * change every time that \p resource is mapped. + * + * If \p resource is not a texture then it cannot be accessed via an array and + * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY is returned. + * If \p arrayIndex is not a valid array index for \p resource then + * ::CUDA_ERROR_INVALID_VALUE is returned. + * If \p mipLevel is not a valid mipmap level for \p resource then + * ::CUDA_ERROR_INVALID_VALUE is returned. + * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned. + * + * \param pArray - Returned array through which a subresource of \p resource may be accessed + * \param resource - Mapped resource to access + * \param arrayIndex - Array index for array textures or cubemap face + * index as defined by ::CUarray_cubemap_face for + * cubemap textures for the subresource to access + * \param mipLevel - Mipmap level for the subresource to access + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_MAPPED, + * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY + * \notefnerr + * + * \sa + * ::cuGraphicsResourceGetMappedPointer, + * ::cudaGraphicsSubResourceGetMappedArray + */ +CUresult CUDAAPI cuGraphicsSubResourceGetMappedArray(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel); + +/** + * \brief Get a mipmapped array through which to access a mapped graphics resource. + * + * Returns in \p *pMipmappedArray a mipmapped array through which the mapped graphics + * resource \p resource. The value set in \p *pMipmappedArray may change every time + * that \p resource is mapped. + * + * If \p resource is not a texture then it cannot be accessed via a mipmapped array and + * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY is returned. + * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned. + * + * \param pMipmappedArray - Returned mipmapped array through which \p resource may be accessed + * \param resource - Mapped resource to access + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_MAPPED, + * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY + * \notefnerr + * + * \sa + * ::cuGraphicsResourceGetMappedPointer, + * ::cudaGraphicsResourceGetMappedMipmappedArray + */ +CUresult CUDAAPI cuGraphicsResourceGetMappedMipmappedArray(CUmipmappedArray *pMipmappedArray, CUgraphicsResource resource); + +/** + * \brief Get a device pointer through which to access a mapped graphics resource. + * + * Returns in \p *pDevPtr a pointer through which the mapped graphics resource + * \p resource may be accessed. + * Returns in \p pSize the size of the memory in bytes which may be accessed from that pointer. + * The value set in \p pPointer may change every time that \p resource is mapped. + * + * If \p resource is not a buffer then it cannot be accessed via a pointer and + * ::CUDA_ERROR_NOT_MAPPED_AS_POINTER is returned. + * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned. + * * + * \param pDevPtr - Returned pointer through which \p resource may be accessed + * \param pSize - Returned size of the buffer accessible starting at \p *pPointer + * \param resource - Mapped resource to access + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_MAPPED, + * ::CUDA_ERROR_NOT_MAPPED_AS_POINTER + * \notefnerr + * + * \sa + * ::cuGraphicsMapResources, + * ::cuGraphicsSubResourceGetMappedArray, + * ::cudaGraphicsResourceGetMappedPointer + */ +CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource); + +/** + * \brief Set usage flags for mapping a graphics resource + * + * Set \p flags for mapping the graphics resource \p resource. + * + * Changes to \p flags will take effect the next time \p resource is mapped. + * The \p flags argument may be any of the following: + + * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this + * resource will be used. It is therefore assumed that this resource will be + * read from and written to by CUDA kernels. This is the default value. + * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READONLY: Specifies that CUDA kernels which + * access this resource will not write to this resource. + * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITEDISCARD: Specifies that CUDA kernels + * which access this resource will not read from this resource and will + * write over the entire contents of the resource, so none of the data + * previously stored in the resource will be preserved. + * + * If \p resource is presently mapped for access by CUDA then + * ::CUDA_ERROR_ALREADY_MAPPED is returned. + * If \p flags is not one of the above values then ::CUDA_ERROR_INVALID_VALUE is returned. + * + * \param resource - Registered resource to set flags for + * \param flags - Parameters for resource mapping + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_ALREADY_MAPPED + * \notefnerr + * + * \sa + * ::cuGraphicsMapResources, + * ::cudaGraphicsResourceSetMapFlags + */ +CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags); + +/** + * \brief Map graphics resources for access by CUDA + * + * Maps the \p count graphics resources in \p resources for access by CUDA. + * + * The resources in \p resources may be accessed by CUDA until they + * are unmapped. The graphics API from which \p resources were registered + * should not access any resources while they are mapped by CUDA. If an + * application does so, the results are undefined. + * + * This function provides the synchronization guarantee that any graphics calls + * issued before ::cuGraphicsMapResources() will complete before any subsequent CUDA + * work issued in \p stream begins. + * + * If \p resources includes any duplicate entries then ::CUDA_ERROR_INVALID_HANDLE is returned. + * If any of \p resources are presently mapped for access by CUDA then ::CUDA_ERROR_ALREADY_MAPPED is returned. + * + * \param count - Number of resources to map + * \param resources - Resources to map for CUDA usage + * \param hStream - Stream with which to synchronize + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_ALREADY_MAPPED, + * ::CUDA_ERROR_UNKNOWN + * \note_null_stream + * \notefnerr + * + * \sa + * ::cuGraphicsResourceGetMappedPointer, + * ::cuGraphicsSubResourceGetMappedArray, + * ::cuGraphicsUnmapResources, + * ::cudaGraphicsMapResources + */ +CUresult CUDAAPI cuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); + +/** + * \brief Unmap graphics resources. + * + * Unmaps the \p count graphics resources in \p resources. + * + * Once unmapped, the resources in \p resources may not be accessed by CUDA + * until they are mapped again. + * + * This function provides the synchronization guarantee that any CUDA work issued + * in \p stream before ::cuGraphicsUnmapResources() will complete before any + * subsequently issued graphics work begins. + * + * + * If \p resources includes any duplicate entries then ::CUDA_ERROR_INVALID_HANDLE is returned. + * If any of \p resources are not presently mapped for access by CUDA then ::CUDA_ERROR_NOT_MAPPED is returned. + * + * \param count - Number of resources to unmap + * \param resources - Resources to unmap + * \param hStream - Stream with which to synchronize + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_MAPPED, + * ::CUDA_ERROR_UNKNOWN + * \note_null_stream + * \notefnerr + * + * \sa + * ::cuGraphicsMapResources, + * ::cudaGraphicsUnmapResources + */ +CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); + +/** @} */ /* END CUDA_GRAPHICS */ + +/** + * \defgroup CUDA_DRIVER_ENTRY_POINT Driver Entry Point Access + * + * ___MANBRIEF___ driver entry point access functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the driver entry point access functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Returns the requested driver API function pointer + * + * Returns in \p **pfn the address of the CUDA driver function for the requested + * CUDA version and flags. + * + * The CUDA version is specified as (1000 * major + 10 * minor), so CUDA 11.2 + * should be specified as 11020. For a requested driver symbol, if the specified + * CUDA version is greater than or equal to the CUDA version in which the driver symbol + * was introduced, this API will return the function pointer to the corresponding + * versioned function. + * + * The pointer returned by the API should be cast to a function pointer matching the + * requested driver function's definition in the API header file. The function pointer + * typedef can be picked up from the corresponding typedefs header file. For example, + * cudaTypedefs.h consists of function pointer typedefs for driver APIs defined in cuda.h. + * + * The API will return ::CUDA_SUCCESS and set the returned \p pfn to NULL if the + * requested driver function is not supported on the platform, no ABI + * compatible driver function exists for the specified \p cudaVersion or if the + * driver symbol is invalid. + * + * It will also set the optional \p symbolStatus to one of the values in + * ::CUdriverProcAddressQueryResult with the following meanings: + * - ::CU_GET_PROC_ADDRESS_SUCCESS - The requested symbol was succesfully found based + * on input arguments and \p pfn is valid + * - ::CU_GET_PROC_ADDRESS_SYMBOL_NOT_FOUND - The requested symbol was not found + * - ::CU_GET_PROC_ADDRESS_VERSION_NOT_SUFFICIENT - The requested symbol was found but is + * not supported by cudaVersion specified + * + * The requested flags can be: + * - ::CU_GET_PROC_ADDRESS_DEFAULT: This is the default mode. This is equivalent to + * ::CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM if the code is compiled with + * --default-stream per-thread compilation flag or the macro CUDA_API_PER_THREAD_DEFAULT_STREAM + * is defined; ::CU_GET_PROC_ADDRESS_LEGACY_STREAM otherwise. + * - ::CU_GET_PROC_ADDRESS_LEGACY_STREAM: This will enable the search for all driver symbols + * that match the requested driver symbol name except the corresponding per-thread versions. + * - ::CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM: This will enable the search for all + * driver symbols that match the requested driver symbol name including the per-thread + * versions. If a per-thread version is not found, the API will return the legacy version + * of the driver function. + * + * \param symbol - The base name of the driver API function to look for. As an example, + * for the driver API ::cuMemAlloc_v2, \p symbol would be cuMemAlloc and + * \p cudaVersion would be the ABI compatible CUDA version for the _v2 variant. + * \param pfn - Location to return the function pointer to the requested driver function + * \param cudaVersion - The CUDA version to look for the requested driver symbol + * \param flags - Flags to specify search options. + * \param symbolStatus - Optional location to store the status of the search for + * \p symbol based on \p cudaVersion. See ::CUdriverProcAddressQueryResult + * for possible values. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \note_version_mixing + * + * \sa + * ::cudaGetDriverEntryPoint + */ +CUresult CUDAAPI cuGetProcAddress(const char *symbol, void **pfn, int cudaVersion, cuuint64_t flags, CUdriverProcAddressQueryResult *symbolStatus); + +/** @} */ /* END CUDA_DRIVER_ENTRY_POINT */ + +/** + * \defgroup CUDA_COREDUMP Coredump Attributes Control API + * + * ___MANBRIEF___ coredump attribute control functions for the low-level CUDA API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the coredump attribute control functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * Flags for choosing a coredump attribute to get/set + */ +typedef enum CUcoredumpSettings_enum { + CU_COREDUMP_ENABLE_ON_EXCEPTION = 1, + CU_COREDUMP_TRIGGER_HOST, + CU_COREDUMP_LIGHTWEIGHT, + CU_COREDUMP_ENABLE_USER_TRIGGER, + CU_COREDUMP_FILE, + CU_COREDUMP_PIPE, + CU_COREDUMP_MAX +} CUcoredumpSettings; + +/** + * \brief Allows caller to fetch a coredump attribute value for the current context + * + * Returns in \p *value the requested value specified by \p attrib. It is up to the caller + * to ensure that the data type and size of \p *value matches the request. + * + * If the caller calls this function with \p *value equal to NULL, the size of the memory + * region (in bytes) expected for \p attrib will be placed in \p size. + * + * The supported attributes are: + * - ::CU_COREDUMP_ENABLE_ON_EXCEPTION: Bool where ::true means that GPU exceptions from + * this context will create a coredump at the location specified by ::CU_COREDUMP_FILE. + * The default value is ::false unless set to ::true globally or locally, or the + * CU_CTX_USER_COREDUMP_ENABLE flag was set during context creation. + * - ::CU_COREDUMP_TRIGGER_HOST: Bool where ::true means that the host CPU will + * also create a coredump. The default value is ::true unless set to ::false globally or + * or locally. + * - ::CU_COREDUMP_LIGHTWEIGHT: Bool where ::true means that any resulting coredumps + * will not have a dump of GPU memory or non-reloc ELF images. The default value is + * ::false unless set to ::true globally or locally. + * - ::CU_COREDUMP_ENABLE_USER_TRIGGER: Bool where ::true means that a coredump can be + * created by writing to the system pipe specified by ::CU_COREDUMP_PIPE. The default + * value is ::false unless set to ::true globally or locally. + * - ::CU_COREDUMP_FILE: String of up to 1023 characters that defines the location where + * any coredumps generated by this context will be written. The default value is + * ::core.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running + * the CUDA applications and ::PID is the process ID of the CUDA application. + * - ::CU_COREDUMP_PIPE: String of up to 1023 characters that defines the name of the pipe + * that will be monitored if user-triggered coredumps are enabled. The default value is + * ::corepipe.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running + * the CUDA application and ::PID is the process ID of the CUDA application. + * + * \param attrib - The enum defining which value to fetch. + * \param value - void* containing the requested data. + * \param size - The size of the memory region \p value points to. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_PERMITTED, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_CONTEXT_IS_DESTROYED + * + * \sa + * ::cuCoredumpGetAttributeGlobal, + * ::cuCoredumpSetAttribute, + * ::cuCoredumpSetAttributeGlobal + */ +CUresult CUDAAPI cuCoredumpGetAttribute(CUcoredumpSettings attrib, void* value, size_t *size); + +/** + * \brief Allows caller to fetch a coredump attribute value for the entire application + * + * Returns in \p *value the requested value specified by \p attrib. It is up to the caller + * to ensure that the data type and size of \p *value matches the request. + * + * If the caller calls this function with \p *value equal to NULL, the size of the memory + * region (in bytes) expected for \p attrib will be placed in \p size. + * + * The supported attributes are: + * - ::CU_COREDUMP_ENABLE_ON_EXCEPTION: Bool where ::true means that GPU exceptions from + * this context will create a coredump at the location specified by ::CU_COREDUMP_FILE. + * The default value is ::false. + * - ::CU_COREDUMP_TRIGGER_HOST: Bool where ::true means that the host CPU will + * also create a coredump. The default value is ::true. + * - ::CU_COREDUMP_LIGHTWEIGHT: Bool where ::true means that any resulting coredumps + * will not have a dump of GPU memory or non-reloc ELF images. The default value is + * ::false. + * - ::CU_COREDUMP_ENABLE_USER_TRIGGER: Bool where ::true means that a coredump can be + * created by writing to the system pipe specified by ::CU_COREDUMP_PIPE. The default + * value is ::false. + * - ::CU_COREDUMP_FILE: String of up to 1023 characters that defines the location where + * any coredumps generated by this context will be written. The default value is + * ::core.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running + * the CUDA applications and ::PID is the process ID of the CUDA application. + * - ::CU_COREDUMP_PIPE: String of up to 1023 characters that defines the name of the pipe + * that will be monitored if user-triggered coredumps are enabled. The default value is + * ::corepipe.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running + * the CUDA application and ::PID is the process ID of the CUDA application. + * + * \param attrib - The enum defining which value to fetch. + * \param value - void* containing the requested data. + * \param size - The size of the memory region \p value points to. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuCoredumpGetAttribute, + * ::cuCoredumpSetAttribute, + * ::cuCoredumpSetAttributeGlobal + */ +CUresult CUDAAPI cuCoredumpGetAttributeGlobal(CUcoredumpSettings attrib, void *value, size_t *size); + +/** + * \brief Allows caller to set a coredump attribute value for the current context + * + * This function should be considered an alternate interface to the CUDA-GDB environment + * variables defined in this document: https://docs.nvidia.com/cuda/cuda-gdb/index.html#gpu-coredump + * + * An important design decision to note is that any coredump environment variable values + * set before CUDA initializes will take permanent precedence over any values set with this + * this function. This decision was made to ensure no change in behavior for any users that + * may be currently using these variables to get coredumps. + * + * \p *value shall contain the requested value specified by \p set. It is up to the caller + * to ensure that the data type and size of \p *value matches the request. + * + * If the caller calls this function with \p *value equal to NULL, the size of the memory + * region (in bytes) expected for \p set will be placed in \p size. + * + * ::CU_COREDUMP_ENABLE_USER_TRIGGER and ::CU_COREDUMP_PIPE cannot be set on a per-context basis. + * + * The supported attributes are: + * - ::CU_COREDUMP_ENABLE_ON_EXCEPTION: Bool where ::true means that GPU exceptions from + * this context will create a coredump at the location specified by ::CU_COREDUMP_FILE. + * The default value is ::false. + * - ::CU_COREDUMP_TRIGGER_HOST: Bool where ::true means that the host CPU will + * also create a coredump. The default value is ::true. + * - ::CU_COREDUMP_LIGHTWEIGHT: Bool where ::true means that any resulting coredumps + * will not have a dump of GPU memory or non-reloc ELF images. The default value is + * ::false. + * - ::CU_COREDUMP_FILE: String of up to 1023 characters that defines the location where + * any coredumps generated by this context will be written. The default value is + * ::core.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running + * the CUDA applications and ::PID is the process ID of the CUDA application. + * + * \param attrib - The enum defining which value to set. + * \param value - void* containing the requested data. + * \param size - The size of the memory region \p value points to. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_PERMITTED, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_CONTEXT_IS_DESTROYED + * + * \sa + * ::cuCoredumpGetAttributeGlobal, + * ::cuCoredumpGetAttribute, + * ::cuCoredumpSetAttributeGlobal + */ +CUresult CUDAAPI cuCoredumpSetAttribute(CUcoredumpSettings attrib, void* value, size_t *size); + +/** + * \brief Allows caller to set a coredump attribute value globally + * + * This function should be considered an alternate interface to the CUDA-GDB environment + * variables defined in this document: https://docs.nvidia.com/cuda/cuda-gdb/index.html#gpu-coredump + * + * An important design decision to note is that any coredump environment variable values + * set before CUDA initializes will take permanent precedence over any values set with this + * this function. This decision was made to ensure no change in behavior for any users that + * may be currently using these variables to get coredumps. + * + * \p *value shall contain the requested value specified by \p set. It is up to the caller + * to ensure that the data type and size of \p *value matches the request. + * + * If the caller calls this function with \p *value equal to NULL, the size of the memory + * region (in bytes) expected for \p set will be placed in \p size. + * + * The supported attributes are: + * - ::CU_COREDUMP_ENABLE_ON_EXCEPTION: Bool where ::true means that GPU exceptions from + * this context will create a coredump at the location specified by ::CU_COREDUMP_FILE. + * The default value is ::false. + * - ::CU_COREDUMP_TRIGGER_HOST: Bool where ::true means that the host CPU will + * also create a coredump. The default value is ::true. + * - ::CU_COREDUMP_LIGHTWEIGHT: Bool where ::true means that any resulting coredumps + * will not have a dump of GPU memory or non-reloc ELF images. The default value is + * ::false. + * - ::CU_COREDUMP_ENABLE_USER_TRIGGER: Bool where ::true means that a coredump can be + * created by writing to the system pipe specified by ::CU_COREDUMP_PIPE. The default + * value is ::false. + * - ::CU_COREDUMP_FILE: String of up to 1023 characters that defines the location where + * any coredumps generated by this context will be written. The default value is + * ::core.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running + * the CUDA applications and ::PID is the process ID of the CUDA application. + * - ::CU_COREDUMP_PIPE: String of up to 1023 characters that defines the name of the pipe + * that will be monitored if user-triggered coredumps are enabled. This value may not be + * changed after ::CU_COREDUMP_ENABLE_USER_TRIGGER is set to ::true. The default + * value is ::corepipe.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine + * running the CUDA application and ::PID is the process ID of the CUDA application. + * + * \param attrib - The enum defining which value to set. + * \param value - void* containing the requested data. + * \param size - The size of the memory region \p value points to. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_PERMITTED + * + * \sa + * ::cuCoredumpGetAttribute, + * ::cuCoredumpGetAttributeGlobal, + * ::cuCoredumpSetAttribute + */ +CUresult CUDAAPI cuCoredumpSetAttributeGlobal(CUcoredumpSettings attrib, void *value, size_t *size); + +/** @} */ /* END CUDA_COREDUMP */ + +CUresult CUDAAPI cuGetExportTable(const void **ppExportTable, const CUuuid *pExportTableId); + +/** + * CUDA API versioning support + */ +#if defined(__CUDA_API_VERSION_INTERNAL) + #undef cuMemHostRegister + #undef cuGraphicsResourceSetMapFlags + #undef cuLinkCreate + #undef cuLinkAddData + #undef cuLinkAddFile + #undef cuDeviceTotalMem + #undef cuCtxCreate + #undef cuModuleGetGlobal + #undef cuMemGetInfo + #undef cuMemAlloc + #undef cuMemAllocPitch + #undef cuMemFree + #undef cuMemGetAddressRange + #undef cuMemAllocHost + #undef cuMemHostGetDevicePointer + #undef cuMemcpyHtoD + #undef cuMemcpyDtoH + #undef cuMemcpyDtoD + #undef cuMemcpyDtoA + #undef cuMemcpyAtoD + #undef cuMemcpyHtoA + #undef cuMemcpyAtoH + #undef cuMemcpyAtoA + #undef cuMemcpyHtoAAsync + #undef cuMemcpyAtoHAsync + #undef cuMemcpy2D + #undef cuMemcpy2DUnaligned + #undef cuMemcpy3D + #undef cuMemcpyHtoDAsync + #undef cuMemcpyDtoHAsync + #undef cuMemcpyDtoDAsync + #undef cuMemcpy2DAsync + #undef cuMemcpy3DAsync + #undef cuMemsetD8 + #undef cuMemsetD16 + #undef cuMemsetD32 + #undef cuMemsetD2D8 + #undef cuMemsetD2D16 + #undef cuMemsetD2D32 + #undef cuArrayCreate + #undef cuArrayGetDescriptor + #undef cuArray3DCreate + #undef cuArray3DGetDescriptor + #undef cuTexRefSetAddress + #undef cuTexRefSetAddress2D + #undef cuTexRefGetAddress + #undef cuGraphicsResourceGetMappedPointer + #undef cuCtxDestroy + #undef cuCtxPopCurrent + #undef cuCtxPushCurrent + #undef cuStreamDestroy + #undef cuEventDestroy + #undef cuMemcpy + #undef cuMemcpyAsync + #undef cuMemcpyPeer + #undef cuMemcpyPeerAsync + #undef cuMemcpy3DPeer + #undef cuMemcpy3DPeerAsync + #undef cuMemsetD8Async + #undef cuMemsetD16Async + #undef cuMemsetD32Async + #undef cuMemsetD2D8Async + #undef cuMemsetD2D16Async + #undef cuMemsetD2D32Async + #undef cuStreamGetPriority + #undef cuStreamGetId + #undef cuStreamGetFlags + #undef cuStreamGetCtx + #undef cuStreamWaitEvent + #undef cuStreamAddCallback + #undef cuStreamAttachMemAsync + #undef cuStreamQuery + #undef cuStreamSynchronize + #undef cuEventRecord + #undef cuEventRecordWithFlags + #undef cuLaunchKernel + #undef cuLaunchKernelEx + #undef cuLaunchHostFunc + #undef cuGraphicsMapResources + #undef cuGraphicsUnmapResources + #undef cuStreamWriteValue32 + #undef cuStreamWaitValue32 + #undef cuStreamWriteValue64 + #undef cuStreamWaitValue64 + #undef cuStreamBatchMemOp + #undef cuStreamWriteValue32_v2 + #undef cuStreamWaitValue32_v2 + #undef cuStreamWriteValue64_v2 + #undef cuStreamWaitValue64_v2 + #undef cuStreamBatchMemOp_v2 + #undef cuMemPrefetchAsync + #undef cuLaunchCooperativeKernel + #undef cuSignalExternalSemaphoresAsync + #undef cuWaitExternalSemaphoresAsync + #undef cuStreamBeginCapture + #undef cuStreamEndCapture + #undef cuStreamIsCapturing + #undef cuStreamGetCaptureInfo + #undef cuStreamGetCaptureInfo_v2 + #undef cuGraphInstantiateWithParams + #undef cuGraphExecUpdate + #undef cuGraphUpload + #undef cuGraphLaunch + #undef cuDevicePrimaryCtxRelease + #undef cuDevicePrimaryCtxReset + #undef cuDevicePrimaryCtxSetFlags + #undef cuIpcOpenMemHandle + #undef cuStreamCopyAttributes + #undef cuStreamSetAttribute + #undef cuStreamGetAttribute + #undef cuGraphInstantiate + #undef cuGraphAddKernelNode + #undef cuGraphKernelNodeGetParams + #undef cuGraphKernelNodeSetParams + #undef cuGraphExecKernelNodeSetParams + #undef cuMemMapArrayAsync + #undef cuMemFreeAsync + #undef cuMemAllocAsync + #undef cuMemAllocFromPoolAsync + #undef cuStreamUpdateCaptureDependencies + #undef cuGetProcAddress + + CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags); + CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags); + CUresult CUDAAPI cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut); + CUresult CUDAAPI cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, + unsigned int numOptions, CUjit_option *options, void **optionValues); + CUresult CUDAAPI cuLinkAddFile(CUlinkState state, CUjitInputType type, const char *path, + unsigned int numOptions, CUjit_option *options, void **optionValues); + CUresult CUDAAPI cuTexRefSetAddress2D_v2(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch); + + typedef unsigned int CUdeviceptr_v1; + + typedef struct CUDA_MEMCPY2D_v1_st + { + unsigned int srcXInBytes; /**< Source X in bytes */ + unsigned int srcY; /**< Source Y */ + CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ + const void *srcHost; /**< Source host pointer */ + CUdeviceptr_v1 srcDevice; /**< Source device pointer */ + CUarray srcArray; /**< Source array reference */ + unsigned int srcPitch; /**< Source pitch (ignored when src is array) */ + + unsigned int dstXInBytes; /**< Destination X in bytes */ + unsigned int dstY; /**< Destination Y */ + CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ + void *dstHost; /**< Destination host pointer */ + CUdeviceptr_v1 dstDevice; /**< Destination device pointer */ + CUarray dstArray; /**< Destination array reference */ + unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */ + + unsigned int WidthInBytes; /**< Width of 2D memory copy in bytes */ + unsigned int Height; /**< Height of 2D memory copy */ + } CUDA_MEMCPY2D_v1; + + typedef struct CUDA_MEMCPY3D_v1_st + { + unsigned int srcXInBytes; /**< Source X in bytes */ + unsigned int srcY; /**< Source Y */ + unsigned int srcZ; /**< Source Z */ + unsigned int srcLOD; /**< Source LOD */ + CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ + const void *srcHost; /**< Source host pointer */ + CUdeviceptr_v1 srcDevice; /**< Source device pointer */ + CUarray srcArray; /**< Source array reference */ + void *reserved0; /**< Must be NULL */ + unsigned int srcPitch; /**< Source pitch (ignored when src is array) */ + unsigned int srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */ + + unsigned int dstXInBytes; /**< Destination X in bytes */ + unsigned int dstY; /**< Destination Y */ + unsigned int dstZ; /**< Destination Z */ + unsigned int dstLOD; /**< Destination LOD */ + CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ + void *dstHost; /**< Destination host pointer */ + CUdeviceptr_v1 dstDevice; /**< Destination device pointer */ + CUarray dstArray; /**< Destination array reference */ + void *reserved1; /**< Must be NULL */ + unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */ + unsigned int dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */ + + unsigned int WidthInBytes; /**< Width of 3D memory copy in bytes */ + unsigned int Height; /**< Height of 3D memory copy */ + unsigned int Depth; /**< Depth of 3D memory copy */ + } CUDA_MEMCPY3D_v1; + + typedef struct CUDA_ARRAY_DESCRIPTOR_v1_st + { + unsigned int Width; /**< Width of array */ + unsigned int Height; /**< Height of array */ + + CUarray_format Format; /**< Array format */ + unsigned int NumChannels; /**< Channels per array element */ + } CUDA_ARRAY_DESCRIPTOR_v1; + + typedef struct CUDA_ARRAY3D_DESCRIPTOR_v1_st + { + unsigned int Width; /**< Width of 3D array */ + unsigned int Height; /**< Height of 3D array */ + unsigned int Depth; /**< Depth of 3D array */ + + CUarray_format Format; /**< Array format */ + unsigned int NumChannels; /**< Channels per array element */ + unsigned int Flags; /**< Flags */ + } CUDA_ARRAY3D_DESCRIPTOR_v1; + + CUresult CUDAAPI cuDeviceTotalMem(unsigned int *bytes, CUdevice dev); + CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev); + CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr_v1 *dptr, unsigned int *bytes, CUmodule hmod, const char *name); + CUresult CUDAAPI cuMemGetInfo(unsigned int *free, unsigned int *total); + CUresult CUDAAPI cuMemAlloc(CUdeviceptr_v1 *dptr, unsigned int bytesize); + CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr_v1 *dptr, unsigned int *pPitch, unsigned int WidthInBytes, unsigned int Height, unsigned int ElementSizeBytes); + CUresult CUDAAPI cuMemFree(CUdeviceptr_v1 dptr); + CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr_v1 *pbase, unsigned int *psize, CUdeviceptr_v1 dptr); + CUresult CUDAAPI cuMemAllocHost(void **pp, unsigned int bytesize); + CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr_v1 *pdptr, void *p, unsigned int Flags); + CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount); + CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount); + CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount); + CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, unsigned int dstOffset, CUdeviceptr_v1 srcDevice, unsigned int ByteCount); + CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr_v1 dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); + CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount); + CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); + CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); + CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D_v1 *pCopy); + CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D_v1 *pCopy); + CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D_v1 *pCopy); + CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D_v1 *pCopy, CUstream hStream); + CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D_v1 *pCopy, CUstream hStream); + CUresult CUDAAPI cuMemsetD8(CUdeviceptr_v1 dstDevice, unsigned char uc, unsigned int N); + CUresult CUDAAPI cuMemsetD16(CUdeviceptr_v1 dstDevice, unsigned short us, unsigned int N); + CUresult CUDAAPI cuMemsetD32(CUdeviceptr_v1 dstDevice, unsigned int ui, unsigned int N); + CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height); + CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height); + CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height); + CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR_v1 *pAllocateArray); + CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray); + CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v1 *pAllocateArray); + CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray); + CUresult CUDAAPI cuTexRefSetAddress(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr_v1 dptr, unsigned int bytes); + CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v1 *desc, CUdeviceptr_v1 dptr, unsigned int Pitch); + CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr_v1 *pdptr, CUtexref hTexRef); + CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr_v1 *pDevPtr, unsigned int *pSize, CUgraphicsResource resource); + + CUresult CUDAAPI cuCtxDestroy(CUcontext ctx); + CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx); + CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx); + CUresult CUDAAPI cuStreamDestroy(CUstream hStream); + CUresult CUDAAPI cuEventDestroy(CUevent hEvent); + CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev); + CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev); + CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags); + + CUresult CUDAAPI cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount); + CUresult CUDAAPI cuMemcpyDtoH_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount); + CUresult CUDAAPI cuMemcpyDtoD_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); + CUresult CUDAAPI cuMemcpyDtoA_v2(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount); + CUresult CUDAAPI cuMemcpyAtoD_v2(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount); + CUresult CUDAAPI cuMemcpyHtoA_v2(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount); + CUresult CUDAAPI cuMemcpyAtoH_v2(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount); + CUresult CUDAAPI cuMemcpyAtoA_v2(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount); + CUresult CUDAAPI cuMemcpyHtoAAsync_v2(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpyAtoHAsync_v2(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpy2D_v2(const CUDA_MEMCPY2D *pCopy); + CUresult CUDAAPI cuMemcpy2DUnaligned_v2(const CUDA_MEMCPY2D *pCopy); + CUresult CUDAAPI cuMemcpy3D_v2(const CUDA_MEMCPY3D *pCopy); + CUresult CUDAAPI cuMemcpyHtoDAsync_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpyDtoHAsync_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpyDtoDAsync_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpy2DAsync_v2(const CUDA_MEMCPY2D *pCopy, CUstream hStream); + CUresult CUDAAPI cuMemcpy3DAsync_v2(const CUDA_MEMCPY3D *pCopy, CUstream hStream); + CUresult CUDAAPI cuMemsetD8_v2(CUdeviceptr dstDevice, unsigned char uc, size_t N); + CUresult CUDAAPI cuMemsetD16_v2(CUdeviceptr dstDevice, unsigned short us, size_t N); + CUresult CUDAAPI cuMemsetD32_v2(CUdeviceptr dstDevice, unsigned int ui, size_t N); + CUresult CUDAAPI cuMemsetD2D8_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height); + CUresult CUDAAPI cuMemsetD2D16_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height); + CUresult CUDAAPI cuMemsetD2D32_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height); + CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); + CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount); + CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy); + CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream); + + CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream); + CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream); + CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream); + CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream); + CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream); + CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream); + + CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority); + CUresult CUDAAPI cuStreamGetId(CUstream hStream, unsigned long long *streamId); + CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags); + CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx); + CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags); + CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags); + CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags); + CUresult CUDAAPI cuStreamQuery(CUstream hStream); + CUresult CUDAAPI cuStreamSynchronize(CUstream hStream); + CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream); + CUresult CUDAAPI cuEventRecordWithFlags(CUevent hEvent, CUstream hStream, unsigned int flags); + CUresult CUDAAPI cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra); + CUresult CUDAAPI cuLaunchKernelEx(const CUlaunchConfig *config, CUfunction f, void **kernelParams, void **extra); + CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void *userData); + CUresult CUDAAPI cuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); + CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); + CUresult CUDAAPI cuStreamWriteValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); + CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); + CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); + CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); + CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags); + + CUresult CUDAAPI cuStreamWriteValue32_ptsz(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); + CUresult CUDAAPI cuStreamWaitValue32_ptsz(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); + CUresult CUDAAPI cuStreamWriteValue64_ptsz(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); + CUresult CUDAAPI cuStreamWaitValue64_ptsz(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); + CUresult CUDAAPI cuStreamBatchMemOp_ptsz(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags); + + CUresult CUDAAPI cuStreamWriteValue32_v2(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); + CUresult CUDAAPI cuStreamWaitValue32_v2(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); + CUresult CUDAAPI cuStreamWriteValue64_v2(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); + CUresult CUDAAPI cuStreamWaitValue64_v2(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); + CUresult CUDAAPI cuStreamBatchMemOp_v2(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags); + CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream); + CUresult CUDAAPI cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams); + CUresult CUDAAPI cuSignalExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); + CUresult CUDAAPI cuWaitExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); + CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream); + CUresult CUDAAPI cuStreamBeginCapture_ptsz(CUstream hStream); + CUresult CUDAAPI cuStreamBeginCapture_v2(CUstream hStream, CUstreamCaptureMode mode); + CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph); + CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *captureStatus); + CUresult CUDAAPI cuStreamGetCaptureInfo(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out); + CUresult CUDAAPI cuStreamGetCaptureInfo_ptsz(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out); + CUresult CUDAAPI cuStreamGetCaptureInfo_v2(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out); + CUresult CUDAAPI cuGraphAddKernelNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams); + CUresult CUDAAPI cuGraphKernelNodeGetParams(CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams); + CUresult CUDAAPI cuGraphKernelNodeSetParams(CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams); + CUresult CUDAAPI cuGraphExecKernelNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams); + CUresult CUDAAPI cuGraphInstantiateWithParams(CUgraphExec *phGraphExec, CUgraph hGraph, CUDA_GRAPH_INSTANTIATE_PARAMS *instantiateParams); + CUresult CUDAAPI cuGraphExecUpdate(CUgraphExec hGraphExec, CUgraph hGraph, CUgraphNode *hErrorNode_out, CUgraphExecUpdateResult *updateResult_out); + CUresult CUDAAPI cuGraphUpload(CUgraphExec hGraph, CUstream hStream); + CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraph, CUstream hStream); + CUresult CUDAAPI cuStreamCopyAttributes(CUstream dstStream, CUstream srcStream); + CUresult CUDAAPI cuStreamGetAttribute(CUstream hStream, CUstreamAttrID attr, CUstreamAttrValue *value); + CUresult CUDAAPI cuStreamSetAttribute(CUstream hStream, CUstreamAttrID attr, const CUstreamAttrValue *param); + + CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags); + CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize); + CUresult CUDAAPI cuGraphInstantiate_v2(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize); + + CUresult CUDAAPI cuMemMapArrayAsync(CUarrayMapInfo *mapInfoList, unsigned int count, CUstream hStream); + + CUresult CUDAAPI cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream); + CUresult CUDAAPI cuMemAllocAsync(CUdeviceptr *dptr, size_t bytesize, CUstream hStream); + CUresult CUDAAPI cuMemAllocFromPoolAsync(CUdeviceptr *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream); + + CUresult CUDAAPI cuStreamUpdateCaptureDependencies(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags); + CUresult CUDAAPI cuGetProcAddress(const char *symbol, void **pfn, int cudaVersion, cuuint64_t flags); + +#elif defined(__CUDA_API_PER_THREAD_DEFAULT_STREAM) +static inline CUresult cuGetProcAddress_v2_ptsz(const char *symbol, void **funcPtr, int driverVersion, cuuint64_t flags, CUdriverProcAddressQueryResult *symbolStatus) { + const int procAddressMask = (CU_GET_PROC_ADDRESS_LEGACY_STREAM| + CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM); + if ((flags & procAddressMask) == 0) { + flags |= CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM; + } + return cuGetProcAddress_v2(symbol, funcPtr, driverVersion, flags, symbolStatus); +} +#define cuGetProcAddress_v2 cuGetProcAddress_v2_ptsz +#endif + +#ifdef __cplusplus +} +#endif + +#if defined(__GNUC__) + #if defined(__CUDA_API_PUSH_VISIBILITY_DEFAULT) + #pragma GCC visibility pop + #endif +#endif + +#undef __CUDA_DEPRECATED + +#endif /* __cuda_cuda_h__ */ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/tools/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/tools/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6fbc573515746ace830f6df8aa7c931e3481c752 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/tools/__pycache__/__init__.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/tools/build_extern.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/tools/build_extern.py new file mode 100644 index 0000000000000000000000000000000000000000..6f00e8192593930018b55ac0daaaf6db69a379c0 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/tools/build_extern.py @@ -0,0 +1,376 @@ +import argparse +import subprocess +from abc import ABC, abstractmethod +from typing import Dict, List, Optional + + +class Symbol: + _name: str + _op_name: str + _ret_type: str + _arg_names: List[str] + _arg_types: List[str] + + def __init__( + self, + name: str, + op_name: str, + ret_type: str, + arg_names: List[str], + arg_types: List[str], + ) -> None: + ''' + A symbol is a function declaration. + :param name: name of the symbol + :param op_name: name of the operation + :param ret_type: return type of the operation + :param arg_names: names of the arguments + :param arg_types: types of the arguments + ''' + self._name = name + self._op_name = op_name + self._ret_type = ret_type + self._arg_names = list(arg_names) + self._arg_types = list(arg_types) + + @property + def name(self) -> str: + return self._name + + @property + def op_name(self) -> str: + return self._op_name + + @property + def ret_type(self) -> str: + return self._ret_type + + @property + def arg_names(self) -> List[str]: + return self._arg_names + + @property + def arg_types(self) -> List[str]: + return self._arg_types + + +def convert_type(type_str) -> Optional[str]: + if type_str == "i32": + return "int32" + elif type_str == "u32": + return "uint32" + elif type_str == "i64": + return "int64" + elif type_str == "u64": + return "uint64" + elif type_str == "float": + return "fp32" + elif type_str == "double": + return "fp64" + else: + # ignore other types, such as pointer types + return None + + +def to_unsigned(type_str) -> str: + if type_str == "int32": + return "uint32" + elif type_str == "int64": + return "uint64" + else: + return type_str + + +class ExternLibrary(ABC): + _name: str + _path: str + _symbols: Dict[str, Symbol] + _format: bool + _grouping: bool + + def __init__( + self, + name: str, + path: str, + format: bool = True, + grouping: bool = True, + ) -> None: + ''' + Abstract class for extern library. + :param name: name of the library + :param path: path of the library + :param format: whether to format the generated stub file + ''' + self._name = name + self._path = path + self._symbols = {} + self._format = format + self._grouping = grouping + + @property + def name(self) -> str: + return self._name + + @property + def path(self) -> str: + return self._path + + @property + def symbols(self) -> Dict[str, Symbol]: + return self._symbols + + @property + def grouping(self) -> bool: + return self._grouping + + @abstractmethod + def parse_symbols(self, input_file) -> None: + pass + + @abstractmethod + def _output_stubs(self) -> str: + pass + + def generate_stub_file(self, output_dir) -> None: + file_str = self._output_stubs() + if file_str is None or len(file_str) == 0: + raise Exception("file_str is empty") + + output_file = f"{output_dir}/{self._name}.py" + with open(output_file, "w") as f: + f.write(file_str) + f.close() + if self._format: + subprocess.Popen(["autopep8", "-a", "-r", "-i", output_file], stdout=subprocess.PIPE).communicate() + subprocess.Popen(["isort", output_file], stdout=subprocess.PIPE).communicate() + + +class Libdevice(ExternLibrary): + _symbol_groups: Dict[str, List[Symbol]] + + def __init__(self, path) -> None: + ''' + Constructor for Libdevice. + :param path: path of the libdevice library + ''' + super().__init__("libdevice", path) + self._symbol_groups = {} + self.is_pure = True + + @staticmethod + def _extract_symbol(line) -> Optional[Symbol]: + # Extract symbols from line in the following format: + # "define [internal] @(,)" + entries = line.split("@") + ret_str = entries[0] + func_str = entries[1] + # Get ret_type, skip internal symbols + ret_strs = ret_str.split() + if ret_strs[1] == "internal": + return None + ret_type = convert_type(ret_strs[1]) + if ret_type is None: + return None + # Get function name + func_strs = func_str.split("(") + func_name = func_strs[0].replace("@", "") + op_name = func_name.replace("__nv_", "") + if 'ieee' in op_name: + return None + # Get arg_types + arg_strs = func_strs[1].split(",") + arg_types = [] + arg_names = [] + for i, arg_str in enumerate(arg_strs): + arg_type = convert_type(arg_str.split()[0]) + if arg_type is None: + return None + arg_name = 'arg' + str(i) + arg_types.append(arg_type) + arg_names.append(arg_name) + if op_name == "sad": + # Special case for sad, where the last argument is an unsigned int + arg_types[-1] = to_unsigned(arg_types[-1]) + elif op_name.startswith("u"): + # LLVM does not differentiate between signed and unsigned integer type. + # We have to convert the types to unsigned + ret_type = to_unsigned(ret_type) + for i, arg_type in enumerate(arg_types): + arg_types[i] = to_unsigned(arg_type) + return Symbol(func_name, op_name, ret_type, arg_names, arg_types) + + def _group_symbols(self) -> None: + symbol_set = {} + for symbol in self._symbols.values(): + op_name = symbol.op_name + symbol_set[op_name] = symbol + + # Group functions together by renaming. + renaming = { + 'llabs': 'abs', 'acosf': 'acos', 'acoshf': 'acosh', 'dadd_rd': 'add_rd', 'fadd_rd': 'add_rd', 'dadd_rn': + 'add_rn', 'fadd_rn': 'add_rn', 'dadd_ru': 'add_ru', 'fadd_ru': 'add_ru', 'dadd_rz': 'add_rz', 'fadd_rz': + 'add_rz', 'asinf': 'asin', 'asinhf': 'asinh', 'atanf': 'atan', 'atan2f': 'atan2', 'atanhf': 'atanh', + 'brevll': 'brev', 'cbrtf': 'cbrt', 'ceilf': 'ceil', 'clzll': 'clz', 'copysignf': 'copysign', 'cosf': 'cos', + 'coshf': 'cosh', 'cospif': 'cospi', 'cyl_bessel_i0f': 'cyl_bessel_i0', 'cyl_bessel_i1f': 'cyl_bessel_i1', + 'fdiv_rd': 'div_rd', 'ddiv_rd': 'div_rd', 'fdiv_rn': 'div_rn', 'ddiv_rn': 'div_rn', 'fdiv_ru': 'div_ru', + 'ddiv_ru': 'div_ru', 'fdiv_rz': 'div_rz', 'ddiv_rz': 'div_rz', 'erff': 'erf', 'erfcf': 'erfc', 'erfcinvf': + 'erfcinv', 'erfcxf': 'erfcx', 'erfinvf': 'erfinv', 'expf': 'exp', 'exp10f': 'exp10', 'exp2f': 'exp2', + 'expm1f': 'expm1', 'fabsf': 'abs', 'fabs': 'abs', 'fast_fdividef': 'fast_dividef', 'fdimf': 'fdim', 'ffsll': + 'ffs', 'floorf': 'floor', 'fmaf': 'fma', 'fmaf_rd': 'fma_rd', 'fmaf_rn': 'fma_rn', 'fmaf_ru': 'fma_ru', + 'fmaf_rz': 'fma_rz', 'fmodf': 'fmod', 'uhadd': 'hadd', 'hypotf': 'hypot', 'ilogbf': 'ilogb', 'isinff': + 'isinf', 'isinfd': 'isinf', 'isnanf': 'isnan', 'isnand': 'isnan', 'j0f': 'j0', 'j1f': 'j1', 'jnf': 'jn', + 'ldexpf': 'ldexp', 'lgammaf': 'lgamma', 'llrintf': 'llrint', 'llroundf': 'llround', 'logf': 'log', 'log10f': + 'log10', 'log1pf': 'log1p', 'log2f': 'log2', 'logbf': 'logb', 'umax': 'max', 'llmax': 'max', 'ullmax': + 'max', 'fmaxf': 'max', 'fmax': 'max', 'umin': 'min', 'llmin': 'min', 'ullmin': 'min', 'fminf': 'min', + 'fmin': 'min', 'dmul_rd': 'mul_rd', 'fmul_rd': 'mul_rd', 'dmul_rn': 'mul_rn', 'fmul_rn': 'mul_rn', + 'dmul_ru': 'mul_ru', 'fmul_ru': 'mul_ru', 'dmul_rz': 'mul_rz', 'fmul_rz': 'mul_rz', 'umul24': 'mul24', + 'umulhi': 'mulhi', 'mul64hi': 'mulhi', 'umul64hi': 'mulhi', 'nearbyintf': 'nearbyint', 'nextafterf': + 'nextafter', 'norm3df': 'norm3d', 'norm4df': 'norm4d', 'normcdff': 'normcdf', 'normcdfinvf': 'normcdfinv', + 'popcll': 'popc', 'powif': 'pow', 'powi': 'pow', 'powf': 'pow', 'rcbrtf': 'rcbrt', 'frcp_rd': 'rcp_rd', + 'drcp_rd': 'rcp_rd', 'frcp_rn': 'rcp_rn', 'drcp_rn': 'rcp_rn', 'frcp_ru': 'rcp_ru', 'drcp_ru': 'rcp_ru', + 'frcp_rz': 'rcp_rz', 'drcp_rz': 'rcp_rz', 'remainderf': 'remainder', 'urhadd': 'rhadd', 'rhypotf': 'rhypot', + 'rintf': 'rint', 'rnorm3df': 'rnorm3d', 'rnorm4df': 'rnorm4d', 'roundf': 'round', 'rsqrtf': 'rsqrt', + 'frsqrt_rn': 'rsqrt_rn', 'usad': 'sad', 'scalbnf': 'scalbn', 'signbitf': 'signbit', 'signbitd': 'signbit', + 'sinf': 'sin', 'sinhf': 'sinh', 'sinpif': 'sinpi', 'sqrtf': 'sqrt', 'fsqrt_rd': 'sqrt_rd', 'dsqrt_rd': + 'sqrt_rd', 'fsqrt_rn': 'sqrt_rn', 'dsqrt_rn': 'sqrt_rn', 'fsqrt_ru': 'sqrt_ru', 'dsqrt_ru': 'sqrt_ru', + 'fsqrt_rz': 'sqrt_rz', 'dsqrt_rz': 'sqrt_rz', 'fsub_rd': 'sub_rd', 'dsub_rd': 'sub_rd', 'fsub_rn': 'sub_rn', + 'dsub_rn': 'sub_rn', 'fsub_ru': 'sub_ru', 'dsub_ru': 'sub_ru', 'fsub_rz': 'sub_rz', 'dsub_rz': 'sub_rz', + 'tanf': 'tan', 'tanhf': 'tanh', 'tgammaf': 'tgamma', 'truncf': 'trunc', 'y0f': 'y0', 'y1f': 'y1', 'ynf': + 'yn' + } + + for symbol in self._symbols.values(): + op_name = symbol.op_name + if op_name in renaming: + op_name = renaming[op_name] + symbol._op_name = op_name + if op_name in self._symbol_groups: + self._symbol_groups[op_name].append(symbol) + else: + self._symbol_groups[op_name] = [symbol] + + def parse_symbols(self, input_file) -> None: + if len(self.symbols) > 0: + return + output = subprocess.check_output(["grep", "define", input_file]).decode().splitlines() + for line in output: + symbol = self._extract_symbol(line) + if symbol is None: + continue + self._symbols[symbol.name] = symbol + + self._group_symbols() + + def _output_stubs(self) -> str: + # Generate python functions in the following format: + # @extern.extern + # def (, _builder=None): + # arg_type_symbol_dict = {[arg_type]: {(symbol, ret_type)}} + # return core.extern_elementwise("libdevice", , , , _builder) + import_str = "from . import core\n" + import_str += "import os\n" + import_str += "import functools\n" + + header_str = "" + header_str += "@functools.lru_cache()\n" + header_str += "def libdevice_path():\n" + header_str += " import torch\n" + header_str += " third_party_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), \"..\", \"third_party\")\n" + header_str += " if torch.version.hip is None:\n" + header_str += " default = os.path.join(third_party_dir, \"cuda\", \"lib\", \"libdevice.10.bc\")\n" + header_str += " else:\n" + header_str += " default = ''\n" + header_str += " return os.getenv(\"TRITON_LIBDEVICE_PATH\", default)\n" + func_str = "" + for symbols in self._symbol_groups.values(): + func_str += "@core.extern\n" + func_name_str = f"def {symbols[0].op_name}(" + for arg_name in symbols[0].arg_names: + func_name_str += f"{arg_name}, " + func_name_str += "_builder=None):\n" + + return_str = f"\treturn core.extern_elementwise(\"{self._name}\", libdevice_path(), [" + for arg_name in symbols[0].arg_names: + return_str += f"{arg_name}, " + return_str += "], \n" + + arg_type_symbol_dict_str = "{" + for symbol in symbols: + arg_type_symbol_dict_str += "(" + for arg_type in symbol.arg_types: + arg_type_symbol_dict_str += f'core.dtype("{arg_type}"),' + ret_type = f'core.dtype("{symbol.ret_type}")' + arg_type_symbol_dict_str += "): (\"" + symbol.name + "\", " + ret_type + "),\n" + arg_type_symbol_dict_str += "}" + + return_str += arg_type_symbol_dict_str + return_str += f", is_pure={self.is_pure}" + return_str += ", _builder=_builder)\n" + + func_str += func_name_str + return_str + "\n" + file_str = import_str + header_str + func_str + + return file_str + + +class LLVMDisassembler: + _path: str + _ll_file: str + + def __init__(self, path) -> None: + ''' + Invoke llvm-dis to disassemble the given file. + :param path: path to llvm-dis + ''' + self._path = path + self._ll_file = "/tmp/extern_lib.ll" + + def disasm(self, lib_path: str) -> None: + subprocess.Popen([self._path, lib_path, "-o", self.ll_file], stdout=subprocess.PIPE).communicate() + + @property + def ll_file(self) -> str: + return self._ll_file + + @property + def path(self) -> str: + return self._path + + +extern_libs = ["libdevice"] + + +def build( + llvm_dis_path: str, + lib_path: str, + lib_name: str, + output_dir: str, +) -> None: + ''' + Interface function to build the library file. + :param llvm_dis_path: path to the llvm-dis binary + :param lib_path: path to the external library file + :param lib_name: name of the library + :param output_dir: path to the output directory + ''' + if lib_name == "libdevice": + extern_lib = Libdevice(lib_path) + else: + raise Exception(f"Unknown extern library: {lib_name}") + + llvm_disassembler = LLVMDisassembler(llvm_dis_path) + llvm_disassembler.disasm(lib_path) + + extern_lib.parse_symbols(llvm_disassembler.ll_file) + extern_lib.generate_stub_file(output_dir) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--llvm-dis", dest="llvm_dis_path", help="Path to llvm-dis", default="llvm-dis") + parser.add_argument("--lib-path", dest="lib_path", help="Path to the extern library") + parser.add_argument("--lib-name", dest="lib_name", help="Name of the extern library") + parser.add_argument("--output", dest="output_dir", help="Output file path", default="/tmp/") + args = parser.parse_args() + + build(args.llvm_dis_path, args.lib_path, args.lib_name, args.output_dir) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/tools/compile.c b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/tools/compile.c new file mode 100644 index 0000000000000000000000000000000000000000..971bf61912a752465c83a0b17ae131bee6f5cc74 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/tools/compile.c @@ -0,0 +1,67 @@ +/* clang-format off */ +#include +#include +#include +#include +#include + + +// helpers to check for cuda errors +#define CUDA_CHECK(ans) {{\ + gpuAssert((ans), __FILE__, __LINE__);\ + }}\ + +static inline void gpuAssert(CUresult code, const char *file, int line) {{ + if (code != CUDA_SUCCESS) {{ + const char *prefix = "Triton Error [CUDA]: "; + const char *str; + cuGetErrorString(code, &str); + char err[1024] = {{0}}; + strcat(err, prefix); + strcat(err, str); + printf("%s\\n", err); + exit(code); + }} +}} + +// globals +#define CUBIN_NAME {kernel_name}_cubin +CUmodule {kernel_name}_mod = NULL; +CUfunction {kernel_name}_func = NULL; +unsigned char CUBIN_NAME[{bin_size}] = {{ {bin_data} }}; + + +void unload_{kernel_name}(void) {{ + CUDA_CHECK(cuModuleUnload({kernel_name}_mod)); +}} + +// TODO: some code duplication with `runtime/backend/cuda.c` +void load_{kernel_name}() {{ + int dev = 0; + void *bin = (void *)&CUBIN_NAME; + int shared = {shared}; + CUDA_CHECK(cuModuleLoadData(&{kernel_name}_mod, bin)); + CUDA_CHECK(cuModuleGetFunction(&{kernel_name}_func, {kernel_name}_mod, "{triton_kernel_name}")); + // set dynamic shared memory if necessary + int shared_optin; + CUDA_CHECK(cuDeviceGetAttribute(&shared_optin, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, dev)); + if (shared > 49152 && shared_optin > 49152) {{ + CUDA_CHECK(cuFuncSetCacheConfig({kernel_name}_func, CU_FUNC_CACHE_PREFER_SHARED)); + CUDA_CHECK(cuFuncSetAttribute({kernel_name}_func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_optin)) + }} +}} + +/* +{kernel_docstring} +*/ +CUresult {kernel_name}(CUstream stream, {signature}) {{ + if ({kernel_name}_func == NULL) + load_{kernel_name}(); + unsigned int gX = {gridX}; + unsigned int gY = {gridY}; + unsigned int gZ = {gridZ}; + void *args[{num_args}] = {{ {arg_pointers} }}; + // TODO: shared memory + if(gX * gY * gZ > 0) + return cuLaunchKernel({kernel_name}_func, gX, gY, gZ, {num_warps} * 32, 1, 1, {shared}, stream, args, NULL); +}} diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/tools/link.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/tools/link.py new file mode 100644 index 0000000000000000000000000000000000000000..eb39b4bda4dbf30c4acd2e0ac4c2f83af6199a65 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/triton/tools/link.py @@ -0,0 +1,322 @@ +from collections import defaultdict +from pathlib import Path +from typing import Sequence, Union + +from dataclasses import dataclass + + +def _exists(x): + return x is not None + + +class LinkerError(Exception): + pass + + +@dataclass +class KernelLinkerMeta: + orig_kernel_name: str + arg_names: Sequence[str] + arg_ctypes: Sequence[str] + sizes: Sequence[Union[int, None]] + sig_hash: str + triton_suffix: str + suffix: str + num_specs: int + """ number of specialized arguments """ + + +class HeaderParser: + + def __init__(self) -> None: + import re + + # [kernel_name, c signature] + self.linker_directives = re.compile("//[\\s]*tt-linker:[\\s]*([\\w]+):(.+):(.+)") + # [name, hash, suffix] + self.kernel_name = re.compile("^([\\w]+)_([\\w]+)_([\\w]+)$") + # [(type, name)] + self.c_sig = re.compile("[\\s]*(\\w+)\\s(\\w+)[,]?") + # [d|c] + self.arg_suffix = re.compile("[c,d]") + + self.kernels = defaultdict(list) + + def extract_linker_meta(self, header: str): + for ln in header.splitlines(): + if ln.startswith("//"): + m = self.linker_directives.match(ln) + if _exists(m): + ker_name, c_sig, algo_info = m.group(1), m.group(2), m.group(3) + name, sig_hash, suffix = self._match_name(ker_name) + c_types, arg_names = self._match_c_sig(c_sig) + num_specs, sizes = self._match_suffix(suffix, c_sig) + self._add_kernel( + "_".join([name, algo_info]), + KernelLinkerMeta( + orig_kernel_name=name, + arg_names=arg_names, + arg_ctypes=c_types, + sizes=sizes, + sig_hash=sig_hash, + triton_suffix=suffix, + suffix=suffix, + num_specs=num_specs, + ), + ) + + def _match_name(self, ker_name: str): + m = self.kernel_name.match(ker_name) + if _exists(m): + name, sig_hash, suffix = m.group(1), m.group(2), m.group(3) + return name, sig_hash, suffix + raise LinkerError(f"{ker_name} is not a valid kernel name") + + def _match_c_sig(self, c_sig: str): + m = self.c_sig.findall(c_sig) + if len(m): + tys, args = [], [] + for ty, arg_name in m: + tys.append(ty) + args.append(arg_name) + return tys, args + + raise LinkerError(f"{c_sig} is not a valid argument signature") + + def _match_suffix(self, suffix: str, c_sig: str): + args = c_sig.split(",") + s2i = {"c": 1, "d": 16} + num_specs = 0 + sizes = [] + # scan through suffix, first find the index, + # then see if it is followed by d or c + for i in range(len(args)): + pos = suffix.find(str(i)) + if pos == -1: + raise LinkerError(f"{suffix} is not a valid kernel suffix") + pos += len(str(i)) + if self.arg_suffix.match(suffix, pos): + num_specs += 1 + sizes.extend([None] * (i - len(sizes))) + sizes.append(s2i[suffix[pos]]) + pos += 1 + if i < len(args) - 1: + suffix = suffix[pos:] + else: + sizes.extend([None] * (len(args) - len(sizes))) + return num_specs, sizes + + def _add_kernel(self, name: str, ker: KernelLinkerMeta): + if name in self.kernels: + last: KernelLinkerMeta = self.kernels[name][-1] + + for cur, new_ in zip(last.arg_ctypes, ker.arg_ctypes): + if cur != new_: + raise LinkerError( + f"Mismatched signature for kernel {name}: \n\texisting sig is: {','.join(last.arg_ctypes)}\n\tcurrent is: {','.join(ker.arg_ctypes)}" + ) + + self.kernels[name].append(ker) + + +def gen_signature_with_full_args(m): + return ", ".join([f"{ty} {arg}" for ty, arg in zip(m.arg_ctypes, m.arg_names)]) + + +def gen_signature(m): + arg_types = [ty for ty, hint in zip(m.arg_ctypes, m.sizes) if hint != 1] + arg_names = [arg for arg, hint in zip(m.arg_names, m.sizes) if hint != 1] + sig = ", ".join([f"{ty} {arg}" for ty, arg in zip(arg_types, arg_names)]) + return sig + + +# generate declarations of kernels with meta-parameter and constant values +def make_algo_decls(name: str, metas: Sequence[KernelLinkerMeta]) -> str: + return f""" +CUresult {name}(CUstream stream, {gen_signature_with_full_args(metas[-1])}); +void load_{name}(); +void unload_{name}(); + """ + + +# generate declarations of kernels with meta-parameter and constant values +def make_global_decl(meta: KernelLinkerMeta) -> str: + return f""" +CUresult {meta.orig_kernel_name}_default(CUstream stream, {gen_signature_with_full_args(meta)}); +CUresult {meta.orig_kernel_name}(CUstream stream, {gen_signature_with_full_args(meta)}, int algo_id); +void load_{meta.orig_kernel_name}(); +void unload_{meta.orig_kernel_name}(); + """ + + +# generate dispatcher function for kernels with different meta-parameter and constant values +def make_default_algo_kernel(meta: KernelLinkerMeta) -> str: + src = f"CUresult {meta.orig_kernel_name}_default(CUstream stream, {gen_signature_with_full_args(meta)}){{\n" + src += (f" return {meta.orig_kernel_name}(stream, {', '.join(meta.arg_names)}, 0);\n") + src += "}\n" + return src + + +# generate dispatcher function for kernels with different integer value hints +def make_kernel_hints_dispatcher(name: str, metas: Sequence[KernelLinkerMeta]) -> str: + src = f"// launcher for: {name}\n" + for meta in sorted(metas, key=lambda m: -m.num_specs): + src += f"CUresult {meta.orig_kernel_name}_{meta.sig_hash}_{meta.suffix}(CUstream stream, {gen_signature(meta)});\n" + src += "\n" + + src += (f"CUresult {name}(CUstream stream, {gen_signature_with_full_args(metas[-1])}){{") + src += "\n" + for meta in sorted(metas, key=lambda m: -m.num_specs): + cond_fn = ( # + lambda val, hint: f"({val} % {hint} == 0)" # + if hint == 16 # + else f"({val} == {hint})" # + if hint == 1 # + else None) + conds = " && ".join([ # + cond_fn(val, hint) # + for val, hint in zip(meta.arg_names, meta.sizes) # + if hint is not None + ]) + src += (f" if ({conds})\n" if any(meta.sizes) else "if (1)\n" + ) # Edge case where no specializations hence no dispatching required + arg_names = [arg for arg, hint in zip(meta.arg_names, meta.sizes) if hint != 1] + src += f" return {meta.orig_kernel_name}_{meta.sig_hash}_{meta.suffix}(stream, {', '.join(arg_names)});\n" + src += "\n" + src += " return CUDA_ERROR_INVALID_VALUE;\n" + src += "}\n" + + for mode in ["load", "unload"]: + src += f"\n// {mode} for: {name}\n" + for meta in sorted(metas, key=lambda m: -m.num_specs): + src += f"void {mode}_{meta.orig_kernel_name}_{meta.sig_hash}_{meta.suffix}();\n" + src += f"void {mode}_{name}() {{" + src += "\n" + for meta in sorted(metas, key=lambda m: -m.num_specs): + src += (f" {mode}_{meta.orig_kernel_name}_{meta.sig_hash}_{meta.suffix}();\n") + src += "}\n" + return src + + +# generate dispatcher function for kernels with different meta-parameter and constant values +def make_kernel_meta_const_dispatcher(meta: KernelLinkerMeta) -> str: + src = f"CUresult {meta.orig_kernel_name}(CUstream stream, {gen_signature_with_full_args(meta)}, int algo_id){{\n" + src += f" assert (algo_id < (int)sizeof({meta.orig_kernel_name}_kernels));\n" + src += f" return {meta.orig_kernel_name}_kernels[algo_id](stream, {', '.join(meta.arg_names)});\n" + src += "}\n" + return src + + +# generate definition of function pointers of kernel dispatchers based on meta-parameter and constant values +def make_func_pointers(names: str, meta: KernelLinkerMeta) -> str: + # the table of hint dispatchers + src = f"typedef CUresult (*kernel_func_t)(CUstream stream, {gen_signature_with_full_args(meta)});\n" + src += f"kernel_func_t {meta.orig_kernel_name}_kernels[] = {{\n" + for name in names: + src += f" {name},\n" + src += "};\n" + return src + + +# generate definition for load/unload functions for kernels with different meta-parameter and constant values +def make_kernel_load_def(names: str, meta: KernelLinkerMeta) -> str: + src = "" + for mode in ["load", "unload"]: + src += f"void {mode}_{meta.orig_kernel_name}(void){{\n" + for name in names: + src += f" {mode}_{name}();\n" + src += "}\n\n" + return src + + +def make_get_num_algos_decl(meta: KernelLinkerMeta) -> str: + src = f"int {meta.orig_kernel_name}_get_num_algos(void);" + return src + + +def make_get_num_algos_def(meta: KernelLinkerMeta) -> str: + src = f"int {meta.orig_kernel_name}_get_num_algos(void){{\n" + src += f" return (int)sizeof({meta.orig_kernel_name}_kernels);\n" + src += "}\n" + return src + + +desc = """ +Triton ahead-of-time linker: + +This program takes in header files generated by compile.py, and generates a +single entry-point responsible for dispatching the user's input to the right +kernel given the specializations that were compiled. + +Example usage: +python link.py /path/to/headers/*.h -o kernel_name +""" + +if __name__ == "__main__": + from argparse import ArgumentParser + + parser = ArgumentParser(description=desc) + parser.add_argument( + "headers", + nargs="+", + help="Paths to header files to link. Must include linker directive annotations (autogenerated by ttc)", + ) + parser.add_argument("--out", "-o", type=Path, help="Out filename") + parser.add_argument( + "--prefix", + type=str, + default="", + help="String to prefix kernel dispatcher names", + ) + args = parser.parse_args() + + # metadata + parser = HeaderParser() + includes = [] + for header in args.headers: + h_path = Path(header) + h_str = h_path.read_text() + includes.append(h_path.name) + parser.extract_linker_meta(h_str) + + # generate headers + algo_decls = [make_algo_decls(name, meta) for name, meta in parser.kernels.items()] + meta_lists = [meta for name, meta in parser.kernels.items()] + meta = meta_lists[0][0] + get_num_algos_decl = make_get_num_algos_decl(meta) + global_decl = make_global_decl(meta) + with args.out.with_suffix(".h").open("w") as fp: + out = "#include \n" + out += "\n".join(algo_decls) + out += "\n" + out += get_num_algos_decl + out += "\n" + out += global_decl + fp.write(out) + + # generate source + defs = [make_kernel_hints_dispatcher(name, meta) for name, meta in parser.kernels.items()] + names = [name for name in parser.kernels.keys()] + func_pointers_def = make_func_pointers(names, meta) + meta_const_def = make_kernel_meta_const_dispatcher(meta) + load_unload_def = make_kernel_load_def(names, meta) + get_num_algos_def = make_get_num_algos_def(meta) + default_algo_kernel = make_default_algo_kernel(meta) + with args.out.with_suffix(".c").open("w") as fp: + out = "" + out += "#include \n" + out += "#include \n" + out += "#include \n" + out += "\n" + out += "\n".join(defs) + out += "\n" + out += func_pointers_def + out += "\n" + out += get_num_algos_def + out += "\n" + out += meta_const_def + out += "\n" + out += load_unload_def + out += "\n" + out += default_algo_kernel + fp.write(out) diff --git a/tuning-competition-baseline/configs/exp_manager/sft.yaml b/tuning-competition-baseline/configs/exp_manager/sft.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7b6906c3d4b12183c501ba6b44c56425ae61b176 --- /dev/null +++ b/tuning-competition-baseline/configs/exp_manager/sft.yaml @@ -0,0 +1,22 @@ +explicit_log_dir: null +exp_dir: ${exp_dir} +name: ${name} +create_wandb_logger: True +wandb_logger_kwargs: + name: ${name} + project: ${project} + entity: ${entity} +resume_if_exists: True +resume_ignore_no_checkpoint: True +create_checkpoint_callback: True +checkpoint_callback_params: + monitor: val_loss + always_save_nemo: False + save_weights_only: True + save_top_k: 1 + save_last: False + mode: min + save_nemo_on_train_end: False + filename: "{step}" + model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} + save_best_model: False # need to keep this false otherwise it will create multiple last.ckpt files because restore reset the previous best model diff --git a/tuning-competition-baseline/configs/model/llm-jp-3-1.8b.yaml b/tuning-competition-baseline/configs/model/llm-jp-3-1.8b.yaml new file mode 100644 index 0000000000000000000000000000000000000000..beb685eb09e42433d6092990fb625a9f1279703c --- /dev/null +++ b/tuning-competition-baseline/configs/model/llm-jp-3-1.8b.yaml @@ -0,0 +1,71 @@ +max_seq_length: 4096 +restore_from_path: "/path/to/model" + +tensor_model_parallel_size: 2 # intra-layer model parallelism +pipeline_model_parallel_size: 1 # inter-layer model parallelism +resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. +save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training. +sync_batch_comm: False +megatron_amp_O2: True +encoder_seq_length: 4096 # the sequence length of the encoder model, it will be overwriten by loaded GPT model + +## Sequence Parallelism +# Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially +# See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. +sequence_parallel: False + +## Activation Checkpoint +activations_checkpoint_granularity: selective # 'selective' or 'full' +activations_checkpoint_method: uniform # 'uniform', 'block', not used with 'selective' +# 'uniform' divides the total number of transformer layers and checkpoints the input activation of each chunk at the specified granularity +# 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity +activations_checkpoint_num_layers: null # not used with 'selective' +activations_checkpoint_layers_per_pipeline: null +# This feature is valid only when used with pipeline-model-parallelism. More details in megatron_gpt_config.yaml. +gradient_as_bucket_view: False +seq_len_interpolation_factor: null # if not None, seq_len_interpolation_factor will match the base model's value +use_flash_attention: True # if not None, will match the base model's value + +hidden_dropout: ${dropout} +attention_dropout: ${dropout} +ffn_dropout: ${dropout} + +use_loss_mask: True + +## Transformer Engine +transformer_engine: True +fp8: False # enables fp8 in TransformerLayer forward +fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 +fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID +fp8_margin: 0 # scaling margin +fp8_interval: 1 # scaling update interval +fp8_amax_history_len: 1 # Number of steps for which amax history is recorded per tensor +fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history +reduce_amax: True # Perform reduction to sync amax tensors across GPUs after every iteration +use_emha: False # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to False. + +peft: + peft_scheme: "none" # ["lora", "none"] + restore_from_path: null + lora_tuning: + target_modules: ['attention_qkv'] # this can either be 'attention_qkv','attention_dense','mlp_fc1','mlp_fc2', 'attention' (qkv & dense), 'mlp' (fc1 & fc2), 'all' + adapter_dim: 32 + adapter_dropout: 0.0 + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + +optim: + name: fused_adam # Supports distributed optimizer for memory savings. To enable, set to 'distributed_fused_adam'. Needs Apex to be built with specific args to work. + lr: ${lr} + weight_decay: 0.1 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 20 + constant_steps: 100 + min_lr: ${min_lr} diff --git a/tuning-competition-baseline/configs/model/llm-jp-3-13b.yaml b/tuning-competition-baseline/configs/model/llm-jp-3-13b.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c58f8e65aee757fddf840f3afb1825e57bf10a6e --- /dev/null +++ b/tuning-competition-baseline/configs/model/llm-jp-3-13b.yaml @@ -0,0 +1,71 @@ +max_seq_length: 4096 +restore_from_path: "/path/to/model" + +tensor_model_parallel_size: 4 # intra-layer model parallelism +pipeline_model_parallel_size: 2 # inter-layer model parallelism +resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. +save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training. +sync_batch_comm: False +megatron_amp_O2: True +encoder_seq_length: 4096 # the sequence length of the encoder model, it will be overwriten by loaded GPT model + +## Sequence Parallelism +# Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially +# See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. +sequence_parallel: False + +## Activation Checkpoint +activations_checkpoint_granularity: full # 'selective' or 'full' +activations_checkpoint_method: uniform # 'uniform', 'block', not used with 'selective' +# 'uniform' divides the total number of transformer layers and checkpoints the input activation of each chunk at the specified granularity +# 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity +activations_checkpoint_num_layers: 1 # not used with 'selective' +activations_checkpoint_layers_per_pipeline: null +# This feature is valid only when used with pipeline-model-parallelism. More details in megatron_gpt_config.yaml. +gradient_as_bucket_view: False +seq_len_interpolation_factor: null # if not None, seq_len_interpolation_factor will match the base model's value +use_flash_attention: True # if not None, will match the base model's value + +hidden_dropout: ${dropout} +attention_dropout: ${dropout} +ffn_dropout: ${dropout} + +use_loss_mask: True + +## Transformer Engine +transformer_engine: True +fp8: False # enables fp8 in TransformerLayer forward +fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 +fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID +fp8_margin: 0 # scaling margin +fp8_interval: 1 # scaling update interval +fp8_amax_history_len: 1 # Number of steps for which amax history is recorded per tensor +fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history +reduce_amax: True # Perform reduction to sync amax tensors across GPUs after every iteration +use_emha: False # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to False. + +peft: + peft_scheme: "none" # ["lora", "none"] + restore_from_path: null + lora_tuning: + target_modules: ['attention_qkv'] # this can either be 'attention_qkv','attention_dense','mlp_fc1','mlp_fc2', 'attention' (qkv & dense), 'mlp' (fc1 & fc2), 'all' + adapter_dim: 32 + adapter_dropout: 0.0 + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + +optim: + name: fused_adam # Supports distributed optimizer for memory savings. To enable, set to 'distributed_fused_adam'. Needs Apex to be built with specific args to work. + lr: ${lr} + weight_decay: 0.1 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 20 + constant_steps: 100 + min_lr: ${min_lr} diff --git a/tuning-competition-baseline/configs/model/llm-jp-3-3.7b.yaml b/tuning-competition-baseline/configs/model/llm-jp-3-3.7b.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8d6a2f1ffbe0754257c1104a6ea2b48aaf22fe0c --- /dev/null +++ b/tuning-competition-baseline/configs/model/llm-jp-3-3.7b.yaml @@ -0,0 +1,71 @@ +max_seq_length: 4096 +restore_from_path: "/path/to/model" # FIXME: Change this `restore_from_path` to the path of the model you want to resume from. + +tensor_model_parallel_size: 4 # intra-layer model parallelism +pipeline_model_parallel_size: 2 # inter-layer model parallelism +resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. +save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training. +sync_batch_comm: False +megatron_amp_O2: True +encoder_seq_length: 4096 # the sequence length of the encoder model, it will be overwriten by loaded GPT model + +## Sequence Parallelism +# Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially +# See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. +sequence_parallel: False + +## Activation Checkpoint +activations_checkpoint_granularity: selective # 'selective' or 'full' +activations_checkpoint_method: uniform # 'uniform', 'block', not used with 'selective' +# 'uniform' divides the total number of transformer layers and checkpoints the input activation of each chunk at the specified granularity +# 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity +activations_checkpoint_num_layers: null # not used with 'selective' +activations_checkpoint_layers_per_pipeline: null +# This feature is valid only when used with pipeline-model-parallelism. More details in megatron_gpt_config.yaml. +gradient_as_bucket_view: False +seq_len_interpolation_factor: null # if not None, seq_len_interpolation_factor will match the base model's value +use_flash_attention: True # if not None, will match the base model's value + +hidden_dropout: ${dropout} +attention_dropout: ${dropout} +ffn_dropout: ${dropout} + +use_loss_mask: True + +## Transformer Engine +transformer_engine: True +fp8: False # enables fp8 in TransformerLayer forward +fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 +fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID +fp8_margin: 0 # scaling margin +fp8_interval: 1 # scaling update interval +fp8_amax_history_len: 1 # Number of steps for which amax history is recorded per tensor +fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history +reduce_amax: True # Perform reduction to sync amax tensors across GPUs after every iteration +use_emha: False # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to False. + +peft: + peft_scheme: "none" # ["lora", "none"] + restore_from_path: null + lora_tuning: + target_modules: ['attention_qkv'] # this can either be 'attention_qkv','attention_dense','mlp_fc1','mlp_fc2', 'attention' (qkv & dense), 'mlp' (fc1 & fc2), 'all' + adapter_dim: 32 + adapter_dropout: 0.0 + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + +optim: + name: fused_adam # Supports distributed optimizer for memory savings. To enable, set to 'distributed_fused_adam'. Needs Apex to be built with specific args to work. + lr: ${lr} + weight_decay: 0.1 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 20 + constant_steps: 100 + min_lr: ${min_lr} diff --git a/tuning-competition-baseline/configs/sft.yaml b/tuning-competition-baseline/configs/sft.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1c5e9f227dc06799a931a83b544226b6461f490e --- /dev/null +++ b/tuning-competition-baseline/configs/sft.yaml @@ -0,0 +1,104 @@ +defaults: + - base + - exp_manager: sft + - model: llm-jp-3-13b + - trainer: sft + - _self_ + +data: + train_ds: + data_dir: ${data_dir}/tuning/train + global_batch_size: ${gbs} + micro_batch_size: ${mbs} + validation_ds: + data_dir: ${data_dir}/tuning/dev + global_batch_size: ${gbs} + micro_batch_size: ${mbs} + +# tuning datasets + # max_train_samples: max number of samples to use for training. -1 means all. 0 means not to use. + # split_dev: whether to split the dataset into training and validation datasets. If false, the dataset is used for training only. + # upsampling_factor: upsampling factor for the dataset. 1 means no upsampling. Valid for both training and validation datasets. +datasets: + answer_carefully: + max_train_samples: -1 # -1 means all + split_dev: false + upsampling_factor: 16 + calm3_22b_chat_20241018083433--Qwen2.5_32B_Instruct_20241022115410: + max_train_samples: -1 + split_dev: true + upsampling_factor: 1 + calm3_22b_chat_20241022133932--Qwen2.5_32B_Instruct_20241024100350: + max_train_samples: -1 + split_dev: true + upsampling_factor: 1 + calm3_22b_chat_20241022155627--Qwen2.5_32B_Instruct_20241024144245: + max_train_samples: -1 + split_dev: true + upsampling_factor: 1 + daring_anteater_en: + max_train_samples: -1 + split_dev: true + upsampling_factor: 1 + flan: + max_train_samples: -1 + split_dev: true + upsampling_factor: 1 + ichikara: + max_train_samples: -1 + split_dev: true + upsampling_factor: 1 + logical_math_coding_wizard8x22b: + max_train_samples: -1 + split_dev: true + upsampling_factor: 1 + multiturn_calm3: + max_train_samples: -1 + split_dev: true + upsampling_factor: 1 + random_to_fixed_multiturn_calm3: + max_train_samples: -1 + split_dev: true + upsampling_factor: 1 + synthetic_jp_en_coding_0: + max_train_samples: -1 + split_dev: true + upsampling_factor: 1 +# number of dev samples are the minimum value of {max_dev_samples, max_dev_ratio * number of dev samples} in the dataset. +max_dev_samples: 1000 +max_dev_ratio: 0.1 + +# hyperparameters +gbs: 64 +mbs: 1 +dropout: 0.0 +lr: 2e-5 +min_lr: 2e-6 + +# other options +use_mpi: false +use_slurm: false # This option should be set to true when using Slurm and MPI. Otherwise, set it to false. + +ignore_hparams_on_save: false + +# constants +hparams_to_ignore_on_save: + - project + - work_dir + - data_dir + - seed + - name + - exp_dir + - run_id + - run_dir + - config_name + - logger + - hparams_to_ignore_on_save + - per_device_train_batch_size + - per_device_eval_batch_size + - gradient_checkpointing + - logging_steps + - eval_steps + - save_steps + - use_mpi + - use_slurm diff --git a/tuning-competition-baseline/scripts/ckpt/convert_llama_nemo_to_hf.py b/tuning-competition-baseline/scripts/ckpt/convert_llama_nemo_to_hf.py new file mode 100644 index 0000000000000000000000000000000000000000..7963d70777078337173bf90f55b8cfdfb0277efc --- /dev/null +++ b/tuning-competition-baseline/scripts/ckpt/convert_llama_nemo_to_hf.py @@ -0,0 +1,304 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from argparse import ArgumentParser +from collections import OrderedDict + +import torch +from joblib import Parallel, delayed +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import ( + MegatronGPTModel, +) +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy +from nemo.utils import logging +from pytorch_lightning import Trainer +from transformers import AutoModelForCausalLM, AutoTokenizer + + +def convert_layer(model, layer: int, torch_dtype) -> dict: + converted = {} + + param_to_weights = lambda param: param.to(torch_dtype) # noqa + + hidden_size: int = model.cfg.hidden_size + head_num: int = model.cfg.num_attention_heads + ffn_hidden_size: int = model.cfg.ffn_hidden_size + num_query_groups: int = model.cfg.get("num_query_groups", head_num) + + head_size: int = hidden_size // head_num + heads_per_group: int = head_num // num_query_groups + qkv_total_dim: int = head_num + 2 * num_query_groups + + qkv_weights = model.state_dict()[ + f"model.module.decoder.layers.{layer}.self_attention.linear_qkv.weight" + ] + qkv_weights = qkv_weights.reshape([qkv_total_dim, head_size, hidden_size]) + + q_slice = torch.cat( + [ + torch.arange( + (heads_per_group + 2) * i, + (heads_per_group + 2) * i + heads_per_group, + ) + for i in range(num_query_groups) + ] + ) + k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2)) + v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2)) + # Example of slices + # 7b: num_query_groups = head_num = 32, + # q_slice = [0, 3, 6, 9 , ... 90, 93] + # k_slice = [1, 4, 7, 10, ... 91, 94] + # v_slice = [2, 5, 8, 11, ... 92, 95] + # 70b (with GQA): num_query_groups = 8, head_num = 64 + # q_slice = [0, 1, .. 6, 7, 10, 11, .. 16, 17, 20, 21, .. 67, 70, ... 76, 77] + # k_slice = [8, 18, 28, ... 68, 78] + # v_slice = [9, 19, 29, ... 69, 79] + + q_weights_base_name: str = f"model.module.layers.{layer}.self_attn.q_proj.weight" + k_weights_base_name: str = f"model.module.layers.{layer}.self_attn.k_proj.weight" + v_weights_base_name: str = f"model.module.layers.{layer}.self_attn.v_proj.weight" + + converted[q_weights_base_name] = param_to_weights( + qkv_weights[q_slice].reshape(-1, hidden_size) + ) + converted[k_weights_base_name] = param_to_weights( + qkv_weights[k_slice].reshape(-1, hidden_size) + ) + converted[v_weights_base_name] = param_to_weights( + qkv_weights[v_slice].reshape(-1, hidden_size) + ) + + # attention dense + o_weight = model.state_dict()[ + f"model.module.decoder.layers.{layer}.self_attention.linear_proj.weight" + ] + o_weight_base_name = f"model.module.layers.{layer}.self_attn.o_proj.weight" + converted[o_weight_base_name] = param_to_weights(o_weight) + + # mlp + mlp_weights = model.state_dict()[ + f"model.module.decoder.layers.{layer}.mlp.linear_fc1.weight" + ] + mlp_down_proj_weight = mlp_weights[:ffn_hidden_size, :] + mlp_gate_proj_weight = mlp_weights[ffn_hidden_size:, :] + + mlp_down_proj_base_name = f"model.module.layers.{layer}.mlp.gate_proj.weight" + mlp_gate_proj_base_name = f"model.module.layers.{layer}.mlp.up_proj.weight" + + converted[mlp_down_proj_base_name] = param_to_weights(mlp_down_proj_weight) + converted[mlp_gate_proj_base_name] = param_to_weights(mlp_gate_proj_weight) + + mlp_up_proj_weight = model.state_dict()[ + f"model.module.decoder.layers.{layer}.mlp.linear_fc2.weight" + ] + mlp_up_proj_base_name = f"model.module.layers.{layer}.mlp.down_proj.weight" + converted[mlp_up_proj_base_name] = param_to_weights(mlp_up_proj_weight) + + # layernorm + input_ln_weight = model.state_dict()[ + f"model.module.decoder.layers.{layer}.self_attention.linear_qkv.layer_norm_weight" + ] + input_ln_base_name = f"model.module.layers.{layer}.input_layernorm.weight" + converted[input_ln_base_name] = param_to_weights(input_ln_weight) + + post_attn_ln_weight = model.state_dict()[ + f"model.module.decoder.layers.{layer}.mlp.linear_fc1.layer_norm_weight" + ] + post_attn_ln_base_name: str = ( + f"model.module.layers.{layer}.post_attention_layernorm.weight" + ) + converted[post_attn_ln_base_name] = param_to_weights(post_attn_ln_weight) + + logging.info(f"Layer {layer} converted") + + return converted + + +def convert( + input_nemo_file: str, + output_hf_file, + precision=None, + cpu_only=False, + n_jobs=1, +) -> torch.dtype: + """ + Convert NeMo weights to HF weights + """ + dummy_trainer = Trainer(devices=1, accelerator="cpu", strategy=NLPDDPStrategy()) + model_config = MegatronGPTModel.restore_from( + input_nemo_file, trainer=dummy_trainer, return_config=True + ) + model_config.tensor_model_parallel_size = 1 + model_config.pipeline_model_parallel_size = 1 + model_config.sequence_parallel = False + if cpu_only: + logging.info( + "******** Loading model on CPU. This will take a significant amount of time." + ) + map_location = torch.device("cpu") + model_config.use_cpu_initialization = True + else: + map_location = None + + model = MegatronGPTModel.restore_from( + input_nemo_file, + trainer=dummy_trainer, + override_config_path=model_config, + map_location=map_location, + ) + if precision is None: + precision = model.cfg.precision + if precision in [32, "32"]: + torch_dtype = torch.float32 + elif precision in [16, "16", "16-mixed"]: + torch_dtype = torch.float16 + elif precision in ["bf16", "bf16-mixed"]: + torch_dtype = torch.bfloat16 + else: + logging.warning( + f"Precision string {precision} is not recognized, falling back to fp32" + ) + torch_dtype = torch.float32 # fallback + logging.info(f"Using precision {torch_dtype}") + + checkpoint = OrderedDict() + + param_to_weights = lambda param: param.to(torch_dtype) # noqa + + # Embedding + embed_weight = model.state_dict()["model.module.embedding.word_embeddings.weight"] + checkpoint["model.embed_tokens.weight"] = param_to_weights(embed_weight) + + # Layers + num_layers: int = model.cfg.num_layers + n_jobs = min(n_jobs, num_layers) + converted_layers: list[dict] = Parallel(n_jobs=n_jobs, prefer="threads")( + delayed(convert_layer)(model, layer, torch_dtype) for layer in range(num_layers) + ) + for converted_layer in converted_layers: + checkpoint.update(converted_layer) + + final_ln_weight = model.state_dict()["model.module.decoder.final_layernorm.weight"] + checkpoint["model.module.norm.weight"] = param_to_weights(final_ln_weight) + output_layer_weight = model.state_dict()["model.module.output_layer.weight"] + checkpoint["lm_head.weight"] = param_to_weights(output_layer_weight) + + if model_config.get("megatron_amp_O2", False): + keys: list[str] = list(checkpoint.keys()) + for key in keys: + checkpoint[key.replace("model.module.", "model.", 1)] = checkpoint.pop(key) + + os.makedirs(os.path.dirname(output_hf_file), exist_ok=True) + torch.save(checkpoint, output_hf_file) + logging.info(f"Weights saved to {output_hf_file}") + + return torch_dtype + + +def replace_hf_weights_and_tokenizer( + weights_file: str, + torch_dtype: torch.dtype, + input_hf_path: str, + output_hf_path: str, +) -> None: + orig_hf_model = AutoModelForCausalLM.from_pretrained( + input_hf_path, torch_dtype=torch_dtype + ) + nemo_weights = torch.load(weights_file) + + tokenizer = AutoTokenizer.from_pretrained(input_hf_path) + tokenizer_length: int = len(tokenizer) + if tokenizer_length > orig_hf_model.config.vocab_size: + logging.warning( + f"Tokenizer length {tokenizer_length} does not match model vocab size {orig_hf_model.config.vocab_size}. Resizing token embeddings." + ) + orig_hf_model.resize_token_embeddings(tokenizer_length) + + orig_hf_model.load_state_dict(nemo_weights) + orig_hf_model.save_pretrained(output_hf_path) + logging.info(f"Full HF model saved to {output_hf_path}") + + tokenizer.save_pretrained(output_hf_path) + logging.info(f"Tokenizer saved to {output_hf_path}") + + +def main(): + parser = ArgumentParser() + parser.add_argument( + "--input-name-or-path", + type=str, + default=None, + required=True, + help="Path to .nemo file or extracted folder", + ) + parser.add_argument( + "--input-hf-path", + type=str, + default="llm-jp/llm-jp-13b-v2.0", + help="Path to the HF model directory to use as input", + ) + parser.add_argument( + "--output-path", + type=str, + default=None, + required=True, + help="Output HF model directory", + ) + parser.add_argument( + "--model-id", + type=str, + default=None, + required=True, + help="Model ID for the Output HF model", + ) + parser.add_argument( + "--precision", + type=str, + default=None, + help="Precision of output weights. Defaults to precision of the input nemo weights (model.cfg.trainer.precision)", + ) + parser.add_argument( + "--cpu-only", + action="store_true", + help="Load model in cpu only. Useful if the model cannot fit in GPU memory, but this option makes the conversion script significantly slower.", + ) + parser.add_argument( + "--n-jobs", + type=int, + default=1, + help="Number of parallel jobs to run. Default is 1. This is for converting the model layers in parallel. If n_jobs > num_layers, it will be set to num_layers.", + ) + args = parser.parse_args() + + output_hf_path: str = f"{args.output_path}/{args.model_id}" + torch_dtype = convert( + input_nemo_file=args.input_name_or_path, + output_hf_file=f"{output_hf_path}/pytorch_model.bin", + precision=args.precision, + cpu_only=args.cpu_only, + n_jobs=args.n_jobs, + ) + replace_hf_weights_and_tokenizer( + weights_file=f"{output_hf_path}/pytorch_model.bin", + torch_dtype=torch_dtype, + input_hf_path=args.input_hf_path, + output_hf_path=output_hf_path, + ) + os.remove(f"{output_hf_path}/pytorch_model.bin") + + +if __name__ == "__main__": + main() \ No newline at end of file